diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py
index 2bcc34d76..c77919559 100644
--- a/docling/backend/msexcel_backend.py
+++ b/docling/backend/msexcel_backend.py
@@ -669,6 +669,8 @@ def _find_page_size(
if not isinstance(item, DocItem):
continue
for provenance in item.prov:
+ if not isinstance(provenance, ProvenanceItem):
+ continue
bbox = provenance.bbox
left = min(left, bbox.l) if left != -1 else bbox.l
right = max(right, bbox.r) if right != -1 else bbox.r
diff --git a/docling/backend/webvtt_backend.py b/docling/backend/webvtt_backend.py
index 2a7d02ce7..e61cd0f34 100644
--- a/docling/backend/webvtt_backend.py
+++ b/docling/backend/webvtt_backend.py
@@ -1,8 +1,9 @@
+import copy
import logging
-import re
+from dataclasses import dataclass, field
from io import BytesIO
from pathlib import Path
-from typing import Annotated, ClassVar, Literal, Optional, Union, cast
+from typing import Literal, Optional, Union
from docling_core.types.doc import (
ContentLayer,
@@ -10,12 +11,20 @@
DoclingDocument,
DocumentOrigin,
Formatting,
- GroupLabel,
- NodeItem,
+ ProvenanceTrack,
)
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
-from pydantic.types import StringConstraints
-from typing_extensions import Self, override
+from docling_core.types.doc.webvtt import (
+ WebVTTCueBoldSpan,
+ WebVTTCueComponent,
+ WebVTTCueComponentWithTerminator,
+ WebVTTCueItalicSpan,
+ WebVTTCueLanguageSpan,
+ WebVTTCueTextSpan,
+ WebVTTCueUnderlineSpan,
+ WebVTTCueVoiceSpan,
+ WebVTTFile,
+)
+from typing_extensions import override
from docling.backend.abstract_backend import DeclarativeDocumentBackend
from docling.datamodel.base_models import InputFormat
@@ -24,409 +33,29 @@
_log = logging.getLogger(__name__)
-class _WebVTTTimestamp(BaseModel):
- """Model representing a WebVTT timestamp.
-
- A WebVTT timestamp is always interpreted relative to the current playback position
- of the media data that the WebVTT file is to be synchronized with.
- """
-
- model_config = ConfigDict(regex_engine="python-re")
-
- raw: Annotated[
- str,
- Field(
- description="A representation of the WebVTT Timestamp as a single string"
- ),
- ]
-
- _pattern: ClassVar[re.Pattern] = re.compile(
- r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
- )
- _hours: int
- _minutes: int
- _seconds: int
- _millis: int
-
- @model_validator(mode="after")
- def validate_raw(self) -> Self:
- m = self._pattern.match(self.raw)
- if not m:
- raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
- self._hours = int(m.group(1)) if m.group(1) else 0
- self._minutes = int(m.group(2))
- self._seconds = int(m.group(3))
- self._millis = int(m.group(4))
-
- if self._minutes < 0 or self._minutes > 59:
- raise ValueError("Minutes must be between 0 and 59")
- if self._seconds < 0 or self._seconds > 59:
- raise ValueError("Seconds must be between 0 and 59")
-
- return self
-
- @property
- def seconds(self) -> float:
- """A representation of the WebVTT Timestamp in seconds"""
- return (
- self._hours * 3600
- + self._minutes * 60
- + self._seconds
- + self._millis / 1000.0
- )
-
- @override
- def __str__(self) -> str:
- return self.raw
-
-
-_WebVTTCueIdentifier = Annotated[
- str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
-]
-
-
-class _WebVTTCueTimings(BaseModel):
- """Model representating WebVTT cue timings."""
-
- start: Annotated[
- _WebVTTTimestamp, Field(description="Start time offset of the cue")
- ]
- end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
-
- @model_validator(mode="after")
- def check_order(self) -> Self:
- if self.start and self.end:
- if self.end.seconds <= self.start.seconds:
- raise ValueError("End timestamp must be greater than start timestamp")
- return self
-
- @override
- def __str__(self):
- return f"{self.start} --> {self.end}"
-
-
-class _WebVTTCueTextSpan(BaseModel):
- """Model representing a WebVTT cue text span."""
-
+@dataclass
+class AnnotatedText:
text: str
- span_type: Literal["text"] = "text"
-
- @field_validator("text", mode="after")
- @classmethod
- def validate_text(cls, value: str) -> str:
- if any(ch in value for ch in {"\n", "\r", "&", "<"}):
- raise ValueError("Cue text span contains invalid characters")
- if len(value) == 0:
- raise ValueError("Cue text span cannot be empty")
- return value
-
- @override
- def __str__(self):
- return self.text
-
-
-class _WebVTTCueVoiceSpan(BaseModel):
- """Model representing a WebVTT cue voice span."""
-
- annotation: Annotated[
- str,
- Field(
- description=(
- "Cue span start tag annotation text representing the name of thevoice"
- )
- ),
- ]
- classes: Annotated[
- list[str],
- Field(description="List of classes representing the cue span's significance"),
- ] = []
- components: Annotated[
- list["_WebVTTCueComponent"],
- Field(description="The components representing the cue internal text"),
- ] = []
- span_type: Literal["v"] = "v"
-
- @field_validator("annotation", mode="after")
- @classmethod
- def validate_annotation(cls, value: str) -> str:
- if any(ch in value for ch in {"\n", "\r", "&", ">"}):
- raise ValueError(
- "Cue span start tag annotation contains invalid characters"
- )
- if not value:
- raise ValueError("Cue text span cannot be empty")
- return value
-
- @field_validator("classes", mode="after")
- @classmethod
- def validate_classes(cls, value: list[str]) -> list[str]:
- for item in value:
- if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
- raise ValueError(
- "A cue span start tag class contains invalid characters"
- )
- if not item:
- raise ValueError("Cue span start tag classes cannot be empty")
- return value
-
- @override
- def __str__(self):
- tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
- inner = "".join(str(span) for span in self.components)
- return f"<{tag} {self.annotation}>{inner}"
-
-
-class _WebVTTCueClassSpan(BaseModel):
- span_type: Literal["c"] = "c"
- components: list["_WebVTTCueComponent"]
-
- @override
- def __str__(self):
- inner = "".join(str(span) for span in self.components)
- return f"{inner}"
-
-
-class _WebVTTCueItalicSpan(BaseModel):
- span_type: Literal["i"] = "i"
- components: list["_WebVTTCueComponent"]
-
- @override
- def __str__(self):
- inner = "".join(str(span) for span in self.components)
- return f"{inner}"
-
-
-class _WebVTTCueBoldSpan(BaseModel):
- span_type: Literal["b"] = "b"
- components: list["_WebVTTCueComponent"]
-
- @override
- def __str__(self):
- inner = "".join(str(span) for span in self.components)
- return f"{inner}"
-
-
-class _WebVTTCueUnderlineSpan(BaseModel):
- span_type: Literal["u"] = "u"
- components: list["_WebVTTCueComponent"]
-
- @override
- def __str__(self):
- inner = "".join(str(span) for span in self.components)
- return f"{inner}"
-
-
-_WebVTTCueComponent = Annotated[
- Union[
- _WebVTTCueTextSpan,
- _WebVTTCueClassSpan,
- _WebVTTCueItalicSpan,
- _WebVTTCueBoldSpan,
- _WebVTTCueUnderlineSpan,
- _WebVTTCueVoiceSpan,
- ],
- Field(discriminator="span_type", description="The WebVTT cue component"),
-]
-
-
-class _WebVTTCueBlock(BaseModel):
- """Model representing a WebVTT cue block.
-
- The optional WebVTT cue settings list is not supported.
- The cue payload is limited to the following spans: text, class, italic, bold,
- underline, and voice.
- """
-
- model_config = ConfigDict(regex_engine="python-re")
-
- identifier: Optional[_WebVTTCueIdentifier] = Field(
- None, description="The WebVTT cue identifier"
+ voice: Optional[str] = None
+ formatting: Optional[Formatting] = None
+ classes: dict[Literal["b", "u", "i", "lang", "v"], list[str]] = field(
+ default_factory=dict
)
- timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
- payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
-
- _pattern_block: ClassVar[re.Pattern] = re.compile(
- r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
- )
- _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
- r"^\.[^\t\n\r &<>]+)?" # zero or more classes
- r"[ \t]+(?P[^\n\r&>]+)>" # required space and annotation
- )
-
- @field_validator("payload", mode="after")
- @classmethod
- def validate_payload(cls, payload):
- for voice in payload:
- if "-->" in str(voice):
- raise ValueError("Cue payload must not contain '-->'")
- return payload
-
- @classmethod
- def parse(cls, raw: str) -> "_WebVTTCueBlock":
- lines = raw.strip().splitlines()
- if not lines:
- raise ValueError("Cue block must have at least one line")
- identifier: Optional[_WebVTTCueIdentifier] = None
- timing_line = lines[0]
- if "-->" not in timing_line and len(lines) > 1:
- identifier = timing_line
- timing_line = lines[1]
- cue_lines = lines[2:]
- else:
- cue_lines = lines[1:]
-
- if "-->" not in timing_line:
- raise ValueError("Cue block must contain WebVTT cue timings")
-
- start, end = [t.strip() for t in timing_line.split("-->")]
- end = re.split(" |\t", end)[0] # ignore the cue settings list
- timings: _WebVTTCueTimings = _WebVTTCueTimings(
- start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
+ lang: set[str] = field(default_factory=set)
+
+ def copy_meta(self, text: str) -> "AnnotatedText":
+ """Return a new ``AnnotatedText`` carrying this item's metadata and ``text``.
+
+ The voice is passed through as-is; the formatting (when set), the
+ per-tag class lists and the language set are copied, so mutating the
+ returned item does not modify this one.
+ """
+ return AnnotatedText(
+ text=text,
+ voice=self.voice,
+ formatting=self.formatting.model_copy() if self.formatting else None,
+ classes=copy.deepcopy(self.classes),
+ lang=self.lang.copy(),
)
- cue_text = " ".join(cue_lines).strip()
- if cue_text.startswith("" not in cue_text:
- # adding close tag for cue voice spans without end tag
- cue_text += ""
-
- stack: list[list[_WebVTTCueComponent]] = [[]]
- tag_stack: list[Union[str, tuple]] = []
-
- pos = 0
- matches = list(cls._pattern_block.finditer(cue_text))
- i = 0
- while i < len(matches):
- match = matches[i]
- if match.start() > pos:
- stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
- tag = match.group(0)
-
- if tag.startswith(("", "", "", "")):
- tag_type = tag[1:2]
- tag_stack.append(tag_type)
- stack.append([])
- elif tag == "":
- children = stack.pop()
- stack[-1].append(_WebVTTCueItalicSpan(components=children))
- tag_stack.pop()
- elif tag == "":
- children = stack.pop()
- stack[-1].append(_WebVTTCueBoldSpan(components=children))
- tag_stack.pop()
- elif tag == "":
- children = stack.pop()
- stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
- tag_stack.pop()
- elif tag == "":
- children = stack.pop()
- stack[-1].append(_WebVTTCueClassSpan(components=children))
- tag_stack.pop()
- elif tag.startswith(""))
- else:
- parts.append(str(span))
-
- return "".join(parts)
-
-
-class _WebVTTFile(BaseModel):
- """A model representing a WebVTT file."""
-
- cue_blocks: list[_WebVTTCueBlock]
-
- @staticmethod
- def verify_signature(content: str) -> bool:
- if not content:
- return False
- elif len(content) == 6:
- return content == "WEBVTT"
- elif len(content) > 6 and content.startswith("WEBVTT"):
- return content[6] in (" ", "\t", "\n")
- else:
- return False
-
- @classmethod
- def parse(cls, raw: str) -> "_WebVTTFile":
- # Normalize newlines to LF
- raw = raw.replace("\r\n", "\n").replace("\r", "\n")
-
- # Check WebVTT signature
- if not cls.verify_signature(raw):
- raise ValueError("Invalid WebVTT file signature")
-
- # Strip "WEBVTT" header line
- lines = raw.split("\n", 1)
- body = lines[1] if len(lines) > 1 else ""
-
- # Remove NOTE/STYLE/REGION blocks
- body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE)
- body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE)
-
- # Split into cue blocks
- raw_blocks = re.split(r"\n\s*\n", body.strip())
- cues: list[_WebVTTCueBlock] = []
- for block in raw_blocks:
- try:
- cues.append(_WebVTTCueBlock.parse(block))
- except ValueError as e:
- _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
-
- return cls(cue_blocks=cues)
-
- def __iter__(self):
- return iter(self.cue_blocks)
- def __getitem__(self, idx):
- return self.cue_blocks[idx]
- def __len__(self):
- return len(self.cue_blocks)
+@dataclass
+class AnnotatedPar:
+ """An ordered run of ``AnnotatedText`` items forming one paragraph of a cue."""
+
+ # Text fragments in the order they were appended.
+ items: list[AnnotatedText]
class WebVTTDocumentBackend(DeclarativeDocumentBackend):
@@ -458,7 +87,7 @@ def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
@override
def is_valid(self) -> bool:
- return _WebVTTFile.verify_signature(self.content)
+ return WebVTTFile.verify_signature(self.content)
@classmethod
@override
@@ -477,36 +106,18 @@ def supported_formats(cls) -> set[InputFormat]:
return {InputFormat.VTT}
@staticmethod
- def _add_text_from_component(
- doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
+ def _add_classes(
+ item: AnnotatedText,
+ key: Literal["b", "u", "i", "lang", "v"],
+ classes: list[str],
) -> None:
- """Adds a TextItem to a document by extracting text from a cue span component.
-
- TODO: address nesting
- """
- formatting = Formatting()
- text = ""
- if isinstance(item, _WebVTTCueItalicSpan):
- formatting.italic = True
- elif isinstance(item, _WebVTTCueBoldSpan):
- formatting.bold = True
- elif isinstance(item, _WebVTTCueUnderlineSpan):
- formatting.underline = True
- if isinstance(item, _WebVTTCueTextSpan):
- text = item.text
- else:
- # TODO: address nesting
- text = "".join(
- [t.text for t in item.components if isinstance(t, _WebVTTCueTextSpan)]
- )
- if text := text.strip():
- doc.add_text(
- label=DocItemLabel.TEXT,
- text=text,
- parent=parent,
- content_layer=ContentLayer.BODY,
- formatting=formatting,
- )
+ """Merge ``classes`` into ``item.classes[key]`` in place, skipping duplicates.
+
+ Does nothing when ``classes`` is empty, so no empty bucket is created
+ for ``key``. Names already in the bucket keep their position; new
+ names are appended in the order given.
+ """
+ if not classes:
+ return
+
+ # setdefault creates the bucket for this tag on first use.
+ bucket = item.classes.setdefault(key, [])
+ for cls in classes:
+ if cls not in bucket:
+ bucket.append(cls)
@override
def convert(self) -> DoclingDocument:
@@ -521,52 +132,115 @@ def convert(self) -> DoclingDocument:
)
doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
- vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
- for block in vtt.cue_blocks:
- block_group = doc.add_group(
- label=GroupLabel.SECTION,
- name="WebVTT cue block",
- parent=None,
- content_layer=ContentLayer.BODY,
- )
- if block.identifier:
- doc.add_text(
- label=DocItemLabel.TEXT,
- text=str(block.identifier),
- parent=block_group,
- content_layer=ContentLayer.BODY,
+ vtt: WebVTTFile = WebVTTFile.parse(self.content)
+ cue_text: list[AnnotatedPar] = []
+ parents: list[AnnotatedText] = []
+
+        def _extract_components(
+            payload: list[WebVTTCueComponentWithTerminator],
+        ) -> None:
+            """Flatten a cue payload into the paragraphs held in ``cue_text``.
+
+            Text spans become ``AnnotatedText`` items appended to the current
+            (last) paragraph. Any other span contributes its metadata
+            (formatting, classes, language, voice) to an item that is pushed
+            on ``parents`` while its inner components are processed
+            recursively. A component with a non-``None`` terminator closes
+            the current paragraph and opens a new one.
+            """
+            nonlocal cue_text, parents
+            if not cue_text:
+                cue_text.append(AnnotatedPar(items=[]))
+            par = cue_text[-1]
+            for comp in payload:
+                # Start from the enclosing span's metadata, if there is one.
+                item: AnnotatedText = (
+                    parents[-1].copy_meta("") if parents else AnnotatedText(text="")
                 )
+                component: WebVTTCueComponent = comp.component
+                if isinstance(component, WebVTTCueTextSpan):
+                    item.text = component.text
+                    par.items.append(item)
+                else:
+                    # configure metadata based on span type
+                    if isinstance(component, WebVTTCueBoldSpan):
+                        item.formatting = item.formatting or Formatting()
+                        item.formatting.bold = True
+                        self._add_classes(item, "b", component.start_tag.classes)
+
+                    elif isinstance(component, WebVTTCueItalicSpan):
+                        item.formatting = item.formatting or Formatting()
+                        item.formatting.italic = True
+                        self._add_classes(item, "i", component.start_tag.classes)
+
+                    elif isinstance(component, WebVTTCueUnderlineSpan):
+                        item.formatting = item.formatting or Formatting()
+                        item.formatting.underline = True
+                        self._add_classes(item, "u", component.start_tag.classes)
+
+                    elif isinstance(component, WebVTTCueLanguageSpan):
+                        item.lang.add(component.start_tag.annotation)
+                        self._add_classes(item, "lang", component.start_tag.classes)
+
+                    elif isinstance(component, WebVTTCueVoiceSpan):
+                        # voice spans cannot be embedded
+                        item.voice = component.start_tag.annotation
+                        self._add_classes(item, "v", component.start_tag.classes)
+
+                    parents.append(item)
+                    _extract_components(component.internal_text.components)
+                    parents.pop()
+                    # The nested call may have opened new paragraphs (a line
+                    # terminator inside the span). Re-sync so text that
+                    # follows the span lands in the latest paragraph rather
+                    # than the stale one captured before the recursion.
+                    par = cue_text[-1]
+
+                if comp.terminator is not None:
+                    cue_text.append(AnnotatedPar(items=[]))
+                    par = cue_text[-1]
+
+        def _add_text_item(
+            text: str,
+            formatting: Optional[Formatting],
+            item: AnnotatedText,
+            parent=None,
+        ) -> None:
+            """Add one text item to ``doc`` with a ``ProvenanceTrack`` attached.
+
+            Timings and the cue identifier are not parameters: they are read
+            from the enclosing loop's ``block`` and ``identifier`` variables
+            at call time, so this must only be called while a cue block is
+            being processed. Languages, classes and voice come from ``item``.
+            """
+            # ``item.lang`` is a set, whose iteration order is not stable
+            # across interpreter runs; sort it so the output is reproducible.
+            languages = sorted(item.lang) if item.lang else None
+            # One entry per tag, formatted as "<tag>.<class>.<class>...".
+            classes = (
+                [".".join([k, *v]) for k, v in item.classes.items()]
+                if item.classes
+                else None
+            )
+
+            track = ProvenanceTrack(
+                start_time=block.timings.start.seconds,
+                end_time=block.timings.end.seconds,
+                identifier=identifier,
+                languages=languages,
+                classes=classes,
+                voice=item.voice or None,
+            )
+
             doc.add_text(
                 label=DocItemLabel.TEXT,
-                text=str(block.timings),
-                parent=block_group,
+                text=text,
                 content_layer=ContentLayer.BODY,
+                prov=track,
+                formatting=formatting,
+                parent=parent,
             )
- for cue_span in block.payload:
- if isinstance(cue_span, _WebVTTCueVoiceSpan):
- voice_group = doc.add_group(
- label=GroupLabel.INLINE,
- name="WebVTT cue voice span",
- parent=block_group,
- content_layer=ContentLayer.BODY,
- )
- voice = cue_span.annotation
- if classes := cue_span.classes:
- voice += f" ({', '.join(classes)})"
- voice += ": "
- doc.add_text(
- label=DocItemLabel.TEXT,
- text=voice,
- parent=voice_group,
- content_layer=ContentLayer.BODY,
+
+ for block in vtt.cue_blocks:
+ cue_text = []
+ parents = []
+ identifier = str(block.identifier) if block.identifier else None
+ _extract_components(block.payload)
+ for par in cue_text:
+ if not par.items:
+ continue
+ if len(par.items) == 1:
+ item = par.items[0]
+ _add_text_item(
+ text=item.text,
+ formatting=item.formatting,
+ item=item,
)
- for item in cue_span.components:
- WebVTTDocumentBackend._add_text_from_component(
- doc, item, voice_group
- )
else:
- WebVTTDocumentBackend._add_text_from_component(
- doc, cue_span, block_group
+ group = doc.add_inline_group(
+ "WebVTT cue span", content_layer=ContentLayer.BODY
)
+ for item in par.items:
+ _add_text_item(
+ text=item.text,
+ formatting=item.formatting,
+ item=item,
+ parent=group,
+ )
return doc
diff --git a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
index db73db8db..70434fd8d 100644
--- a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
+++ b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
@@ -12,8 +12,7 @@
from pathlib import Path
from typing import TYPE_CHECKING, List, Optional, Union, cast
-from docling_core.types.doc import DoclingDocument
-from docling_core.types.doc.document import DocTagsDocument
+from docling_core.types.doc import DoclingDocument, DocTagsDocument, ProvenanceItem
from PIL import Image as PILImage
if TYPE_CHECKING:
@@ -371,13 +370,17 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
page_map = {p.page_no: p for p in conv_res.pages}
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
- if not isinstance(element, DocItem) or len(element.prov) == 0:
+ if (
+ not isinstance(element, DocItem)
+ or not element.prov
+ or not isinstance(prov := element.prov[0], ProvenanceItem)
+ ):
continue
if (
isinstance(element, PictureItem)
and self.pipeline_options.generate_picture_images
):
- page_no = element.prov[0].page_no
+ page_no = prov.page_no
page = page_map.get(page_no)
if page is None:
_log.warning(
@@ -387,10 +390,8 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
assert page.size is not None
assert page.image is not None
- crop_bbox = (
- element.prov[0]
- .bbox.scaled(scale=scale)
- .to_top_left_origin(page_height=page.size.height * scale)
+ crop_bbox = prov.bbox.scaled(scale=scale).to_top_left_origin(
+ page_height=page.size.height * scale
)
cropped_im = page.image.crop(crop_bbox.as_tuple())
diff --git a/docling/models/base_model.py b/docling/models/base_model.py
index c69b5018b..dae4ee92d 100644
--- a/docling/models/base_model.py
+++ b/docling/models/base_model.py
@@ -10,6 +10,7 @@
DoclingDocument,
NodeItem,
PictureItem,
+ ProvenanceItem,
)
from PIL.Image import Image
from typing_extensions import TypeVar
@@ -199,6 +200,8 @@ def prepare_element(
return None
# Crop the image form the page
+ if not isinstance(element.prov[0], ProvenanceItem):
+ return None
element_prov = element.prov[0]
bbox = element_prov.bbox
width = bbox.r - bbox.l
diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py
index 055c74b1f..3643bd9ff 100644
--- a/docling/models/picture_description_base_model.py
+++ b/docling/models/picture_description_base_model.py
@@ -7,6 +7,7 @@
DoclingDocument,
NodeItem,
PictureItem,
+ ProvenanceItem,
)
from docling_core.types.doc.document import ( # TODO: move import to docling_core.types.doc
PictureDescriptionData,
@@ -64,8 +65,8 @@ def __call__(
assert isinstance(el.item, PictureItem)
describe_image = True
# Don't describe the image if it's smaller than the threshold
- if len(el.item.prov) > 0:
- prov = el.item.prov[0] # PictureItems have at most a single provenance
+ if el.item.prov and isinstance(prov := el.item.prov[0], ProvenanceItem):
+ # PictureItems have at most a single provenance
page = doc.pages.get(prov.page_no)
if page is not None:
page_area = page.size.width * page.size.height
diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
index 2bb94e42a..8b2f47092 100644
--- a/docling/pipeline/asr_pipeline.py
+++ b/docling/pipeline/asr_pipeline.py
@@ -1,47 +1,35 @@
import logging
-import os
-import re
import sys
import tempfile
from io import BytesIO
from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Union, cast
-
-from docling_core.types.doc import DoclingDocument, DocumentOrigin
-
-# import whisper # type: ignore
-# import librosa
-# import numpy as np
-# import soundfile as sf # type: ignore
-from docling_core.types.doc.labels import DocItemLabel
-from pydantic import BaseModel, Field, validator
+from typing import Optional, Union
+
+from docling_core.types.doc import (
+ ContentLayer,
+ DocItemLabel,
+ DoclingDocument,
+ DocumentOrigin,
+ ProvenanceTrack,
+)
+from pydantic import BaseModel, Field
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.noop_backend import NoOpBackend
-
-# from pydub import AudioSegment # type: ignore
-# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
from docling.datamodel.accelerator_options import (
AcceleratorOptions,
)
from docling.datamodel.base_models import (
ConversionStatus,
- FormatToMimeType,
)
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult
from docling.datamodel.pipeline_options import (
AsrPipelineOptions,
)
from docling.datamodel.pipeline_options_asr_model import (
InlineAsrMlxWhisperOptions,
InlineAsrNativeWhisperOptions,
- # AsrResponseFormat,
- InlineAsrOptions,
)
-from docling.datamodel.pipeline_options_vlm_model import (
- InferenceFramework,
-)
-from docling.datamodel.settings import settings
from docling.pipeline.base_pipeline import BasePipeline
from docling.utils.accelerator_utils import decide_device
from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -190,8 +178,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
)
for citem in conversation:
+ prov: ProvenanceTrack = ProvenanceTrack(
+ start_time=citem.start_time,
+ end_time=citem.end_time,
+ voice=citem.speaker,
+ )
conv_res.document.add_text(
- label=DocItemLabel.TEXT, text=citem.to_string()
+ label=DocItemLabel.TEXT,
+ text=citem.text,
+ prov=prov,
+ content_layer=ContentLayer.BODY,
)
return conv_res
@@ -299,8 +295,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
)
for citem in conversation:
+ prov: ProvenanceTrack = ProvenanceTrack(
+ start_time=citem.start_time,
+ end_time=citem.end_time,
+ voice=citem.speaker,
+ )
conv_res.document.add_text(
- label=DocItemLabel.TEXT, text=citem.to_string()
+ label=DocItemLabel.TEXT,
+ text=citem.text,
+ prov=prov,
+ content_layer=ContentLayer.BODY,
)
conv_res.status = ConversionStatus.SUCCESS
diff --git a/docling/pipeline/legacy_standard_pdf_pipeline.py b/docling/pipeline/legacy_standard_pdf_pipeline.py
index 55c2703cd..ceca82db9 100644
--- a/docling/pipeline/legacy_standard_pdf_pipeline.py
+++ b/docling/pipeline/legacy_standard_pdf_pipeline.py
@@ -4,7 +4,13 @@
from typing import Optional, cast
import numpy as np
-from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+from docling_core.types.doc import (
+ DocItem,
+ ImageRef,
+ PictureItem,
+ ProvenanceItem,
+ TableItem,
+)
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
@@ -181,7 +187,11 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
):
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
- if not isinstance(element, DocItem) or len(element.prov) == 0:
+ if (
+ not isinstance(element, DocItem)
+ or not element.prov
+ or not isinstance(prov := element.prov[0], ProvenanceItem)
+ ):
continue
if (
isinstance(element, PictureItem)
@@ -190,7 +200,7 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
isinstance(element, TableItem)
and self.pipeline_options.generate_table_images
):
- page_ix = element.prov[0].page_no - 1
+ page_ix = prov.page_no - 1
page = next(
(p for p in conv_res.pages if p.page_no == page_ix),
cast("Page", None),
@@ -199,13 +209,9 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
assert page.size is not None
assert page.image is not None
- crop_bbox = (
- element.prov[0]
- .bbox.scaled(scale=scale)
- .to_top_left_origin(
- page_height=page.size.height * scale
- )
- )
+ crop_bbox = prov.bbox.scaled(
+ scale=scale
+ ).to_top_left_origin(page_height=page.size.height * scale)
cropped_im = page.image.crop(crop_bbox.as_tuple())
element.image = ImageRef.from_pil(
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 585c548c6..54def080e 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -26,7 +26,13 @@
from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, cast
import numpy as np
-from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+from docling_core.types.doc import (
+ DocItem,
+ ImageRef,
+ PictureItem,
+ ProvenanceItem,
+ TableItem,
+)
from docling.backend.abstract_backend import AbstractDocumentBackend
from docling.backend.pdf_backend import PdfDocumentBackend
@@ -760,7 +766,11 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
):
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
- if not isinstance(element, DocItem) or len(element.prov) == 0:
+ if (
+ not isinstance(element, DocItem)
+ or not element.prov
+ or not isinstance(prov := element.prov[0], ProvenanceItem)
+ ):
continue
if (
isinstance(element, PictureItem)
@@ -769,7 +779,7 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
isinstance(element, TableItem)
and self.pipeline_options.generate_table_images
):
- page_ix = element.prov[0].page_no - 1
+ page_ix = prov.page_no - 1
page = next(
(p for p in conv_res.pages if p.page_no == page_ix),
cast("Page", None),
@@ -778,13 +788,9 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
assert page.size is not None
assert page.image is not None
- crop_bbox = (
- element.prov[0]
- .bbox.scaled(scale=scale)
- .to_top_left_origin(
- page_height=page.size.height * scale
- )
- )
+ crop_bbox = prov.bbox.scaled(
+ scale=scale
+ ).to_top_left_origin(page_height=page.size.height * scale)
cropped_im = page.image.crop(crop_bbox.as_tuple())
element.image = ImageRef.from_pil(
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index ab919c4d9..73831fc49 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -165,21 +165,23 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
if self.pipeline_options.generate_picture_images:
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
- if not isinstance(element, DocItem) or len(element.prov) == 0:
+ if (
+ not isinstance(element, DocItem)
+ or not element.prov
+ or not isinstance(prov := element.prov[0], ProvenanceItem)
+ ):
continue
if (
isinstance(element, PictureItem)
and self.pipeline_options.generate_picture_images
):
- page_ix = element.prov[0].page_no - 1
+ page_ix = prov.page_no - 1
page = conv_res.pages[page_ix]
assert page.size is not None
assert page.image is not None
- crop_bbox = (
- element.prov[0]
- .bbox.scaled(scale=scale)
- .to_top_left_origin(page_height=page.size.height * scale)
+ crop_bbox = prov.bbox.scaled(scale=scale).to_top_left_origin(
+ page_height=page.size.height * scale
)
cropped_im = page.image.crop(crop_bbox.as_tuple())
@@ -216,12 +218,14 @@ def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
if self.force_backend_text:
scale = self.pipeline_options.images_scale
for element, _level in conv_res.document.iterate_items():
- if not isinstance(element, TextItem) or len(element.prov) == 0:
+ if (
+ not isinstance(element, TextItem)
+ or not element.prov
+ or not isinstance(prov := element.prov[0], ProvenanceItem)
+ ):
continue
- crop_bbox = (
- element.prov[0]
- .bbox.scaled(scale=scale)
- .to_top_left_origin(page_height=page.size.height * scale)
+ crop_bbox = prov.bbox.scaled(scale=scale).to_top_left_origin(
+ page_height=page.size.height * scale
)
txt = self.extract_text_from_backend(page, crop_bbox)
element.text = txt
diff --git a/pyproject.toml b/pyproject.toml
index 8dc239382..8444ac4b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,6 @@ authors = [
requires-python = '>=3.9,<4.0'
dependencies = [
'pydantic (>=2.0.0,<3.0.0)',
- 'docling-core[chunking] (>=2.50.1,<3.0.0)',
'docling-parse (>=4.7.0,<5.0.0)',
"docling-ibm-models>=3.9.1,<4",
'filetype (>=1.2.0,<2.0.0)',
@@ -74,6 +73,7 @@ dependencies = [
# 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
"accelerate>=1.0.0,<2",
"polyfactory>=2.22.2",
+ "docling-core[chunking]",
]
[project.urls]
@@ -160,6 +160,9 @@ constraints = [
package = true
default-groups = "all"
+[tool.uv.sources]
+# docling-core is resolved from a pinned git commit. HTTPS is used so that
+# resolving the dependency does not require a GitHub SSH key.
+docling-core = { git = "https://github.com/docling-project/docling-core.git", rev = "c75516516358f25add2682674fc7dc6eef2c5164" }
+
[tool.setuptools.packages.find]
include = ["docling*"]
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
index d7840e994..db52ba1b7 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
@@ -1,66 +1,14 @@
item-0 at level 0: unspecified: group _root_
- item-1 at level 1: section: group WebVTT cue block
- item-2 at level 2: text: 00:11.000 --> 00:13.000
- item-3 at level 2: inline: group WebVTT cue voice span
- item-4 at level 3: text: Roger Bingham:
- item-5 at level 3: text: We are in New York City
- item-6 at level 1: section: group WebVTT cue block
- item-7 at level 2: text: 00:13.000 --> 00:16.000
- item-8 at level 2: inline: group WebVTT cue voice span
- item-9 at level 3: text: Roger Bingham:
- item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
- item-11 at level 1: section: group WebVTT cue block
- item-12 at level 2: text: 00:16.000 --> 00:18.000
- item-13 at level 2: inline: group WebVTT cue voice span
- item-14 at level 3: text: Roger Bingham:
- item-15 at level 3: text: from the American Museum of Natural History
- item-16 at level 1: section: group WebVTT cue block
- item-17 at level 2: text: 00:18.000 --> 00:20.000
- item-18 at level 2: inline: group WebVTT cue voice span
- item-19 at level 3: text: Roger Bingham:
- item-20 at level 3: text: And with me is Neil deGrasse Tyson
- item-21 at level 1: section: group WebVTT cue block
- item-22 at level 2: text: 00:20.000 --> 00:22.000
- item-23 at level 2: inline: group WebVTT cue voice span
- item-24 at level 3: text: Roger Bingham:
- item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
- item-26 at level 1: section: group WebVTT cue block
- item-27 at level 2: text: 00:22.000 --> 00:24.000
- item-28 at level 2: inline: group WebVTT cue voice span
- item-29 at level 3: text: Roger Bingham:
- item-30 at level 3: text: at the AMNH.
- item-31 at level 1: section: group WebVTT cue block
- item-32 at level 2: text: 00:24.000 --> 00:26.000
- item-33 at level 2: inline: group WebVTT cue voice span
- item-34 at level 3: text: Roger Bingham:
- item-35 at level 3: text: Thank you for walking down here.
- item-36 at level 1: section: group WebVTT cue block
- item-37 at level 2: text: 00:27.000 --> 00:30.000
- item-38 at level 2: inline: group WebVTT cue voice span
- item-39 at level 3: text: Roger Bingham:
- item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
- item-41 at level 1: section: group WebVTT cue block
- item-42 at level 2: text: 00:30.000 --> 00:31.500
- item-43 at level 2: inline: group WebVTT cue voice span
- item-44 at level 3: text: Roger Bingham:
- item-45 at level 3: text: When we e-mailed—
- item-46 at level 1: section: group WebVTT cue block
- item-47 at level 2: text: 00:30.500 --> 00:32.500
- item-48 at level 2: inline: group WebVTT cue voice span
- item-49 at level 3: text: Neil deGrasse Tyson:
- item-50 at level 3: text: Didn’t we talk about enough in that conversation?
- item-51 at level 1: section: group WebVTT cue block
- item-52 at level 2: text: 00:32.000 --> 00:35.500
- item-53 at level 2: inline: group WebVTT cue voice span
- item-54 at level 3: text: Roger Bingham:
- item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
- item-56 at level 1: section: group WebVTT cue block
- item-57 at level 2: text: 00:32.500 --> 00:33.500
- item-58 at level 2: inline: group WebVTT cue voice span
- item-59 at level 3: text: Neil deGrasse Tyson:
- item-60 at level 3: text: Laughs
- item-61 at level 1: section: group WebVTT cue block
- item-62 at level 2: text: 00:35.500 --> 00:38.000
- item-63 at level 2: inline: group WebVTT cue voice span
- item-64 at level 3: text: Roger Bingham:
- item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
+ item-1 at level 1: text: We are in New York City
+ item-2 at level 1: text: We’re actually at the Lucern Hotel, just down the street
+ item-3 at level 1: text: from the American Museum of Natural History
+ item-4 at level 1: text: And with me is Neil deGrasse Tyson
+ item-5 at level 1: text: Astrophysicist, Director of the Hayden Planetarium
+ item-6 at level 1: text: at the AMNH.
+ item-7 at level 1: text: Thank you for walking down here.
+ item-8 at level 1: text: And I want to do a follow-up on the last conversation we did.
+ item-9 at level 1: text: When we e-mailed—
+ item-10 at level 1: text: Didn’t we talk about enough in that conversation?
+ item-11 at level 1: text: No! No no no no; 'cos 'cos obviously 'cos
+ item-12 at level 1: text: Laughs
+ item-13 at level 1: text: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
index 831182560..5a7c9d29b 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
- "version": "1.7.0",
+ "version": "1.8.0",
"name": "webvtt_example_01",
"origin": {
"mimetype": "text/vtt",
@@ -18,1052 +18,291 @@
"self_ref": "#/body",
"children": [
{
- "$ref": "#/groups/0"
+ "$ref": "#/texts/0"
},
{
- "$ref": "#/groups/2"
+ "$ref": "#/texts/1"
},
{
- "$ref": "#/groups/4"
+ "$ref": "#/texts/2"
},
{
- "$ref": "#/groups/6"
+ "$ref": "#/texts/3"
},
{
- "$ref": "#/groups/8"
+ "$ref": "#/texts/4"
},
{
- "$ref": "#/groups/10"
+ "$ref": "#/texts/5"
},
{
- "$ref": "#/groups/12"
+ "$ref": "#/texts/6"
},
{
- "$ref": "#/groups/14"
+ "$ref": "#/texts/7"
},
{
- "$ref": "#/groups/16"
+ "$ref": "#/texts/8"
},
{
- "$ref": "#/groups/18"
+ "$ref": "#/texts/9"
},
{
- "$ref": "#/groups/20"
+ "$ref": "#/texts/10"
},
{
- "$ref": "#/groups/22"
+ "$ref": "#/texts/11"
},
{
- "$ref": "#/groups/24"
+ "$ref": "#/texts/12"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
- "groups": [
- {
- "self_ref": "#/groups/0",
- "parent": {
- "$ref": "#/body"
- },
- "children": [
- {
- "$ref": "#/texts/0"
- },
- {
- "$ref": "#/groups/1"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/1",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [
- {
- "$ref": "#/texts/1"
- },
- {
- "$ref": "#/texts/2"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- },
+ "groups": [],
+ "texts": [
{
- "self_ref": "#/groups/2",
+ "self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/3"
- },
- {
- "$ref": "#/groups/3"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/3",
- "parent": {
- "$ref": "#/groups/2"
- },
- "children": [
- {
- "$ref": "#/texts/4"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/5"
+ "start_time": 11.0,
+ "end_time": 13.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "We are in New York City",
+ "text": "We are in New York City"
},
{
- "self_ref": "#/groups/4",
+ "self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/6"
- },
- {
- "$ref": "#/groups/5"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/5",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [
- {
- "$ref": "#/texts/7"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/8"
+ "start_time": 13.0,
+ "end_time": 16.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "We’re actually at the Lucern Hotel, just down the street",
+ "text": "We’re actually at the Lucern Hotel, just down the street"
},
{
- "self_ref": "#/groups/6",
+ "self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/9"
- },
- {
- "$ref": "#/groups/7"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/7",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [
- {
- "$ref": "#/texts/10"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/11"
+ "start_time": 16.0,
+ "end_time": 18.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "from the American Museum of Natural History",
+ "text": "from the American Museum of Natural History"
},
{
- "self_ref": "#/groups/8",
+ "self_ref": "#/texts/3",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/12"
- },
- {
- "$ref": "#/groups/9"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/9",
- "parent": {
- "$ref": "#/groups/8"
- },
- "children": [
- {
- "$ref": "#/texts/13"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/14"
+ "start_time": 18.0,
+ "end_time": 20.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "And with me is Neil deGrasse Tyson",
+ "text": "And with me is Neil deGrasse Tyson"
},
{
- "self_ref": "#/groups/10",
+ "self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/15"
- },
- {
- "$ref": "#/groups/11"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/11",
- "parent": {
- "$ref": "#/groups/10"
- },
- "children": [
- {
- "$ref": "#/texts/16"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/17"
+ "start_time": 20.0,
+ "end_time": 22.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Astrophysicist, Director of the Hayden Planetarium",
+ "text": "Astrophysicist, Director of the Hayden Planetarium"
},
{
- "self_ref": "#/groups/12",
+ "self_ref": "#/texts/5",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/18"
- },
- {
- "$ref": "#/groups/13"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/13",
- "parent": {
- "$ref": "#/groups/12"
- },
- "children": [
- {
- "$ref": "#/texts/19"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/20"
+ "start_time": 22.0,
+ "end_time": 24.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "at the AMNH.",
+ "text": "at the AMNH."
},
{
- "self_ref": "#/groups/14",
+ "self_ref": "#/texts/6",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/21"
- },
- {
- "$ref": "#/groups/15"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/15",
- "parent": {
- "$ref": "#/groups/14"
- },
- "children": [
- {
- "$ref": "#/texts/22"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/23"
+ "start_time": 24.0,
+ "end_time": 26.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Thank you for walking down here.",
+ "text": "Thank you for walking down here."
},
{
- "self_ref": "#/groups/16",
+ "self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/24"
- },
- {
- "$ref": "#/groups/17"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/17",
- "parent": {
- "$ref": "#/groups/16"
- },
- "children": [
- {
- "$ref": "#/texts/25"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/26"
+ "start_time": 27.0,
+ "end_time": 30.0,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "And I want to do a follow-up on the last conversation we did.",
+ "text": "And I want to do a follow-up on the last conversation we did."
},
{
- "self_ref": "#/groups/18",
+ "self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/27"
- },
- {
- "$ref": "#/groups/19"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/19",
- "parent": {
- "$ref": "#/groups/18"
- },
- "children": [
- {
- "$ref": "#/texts/28"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/29"
+ "start_time": 30.0,
+ "end_time": 31.5,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "When we e-mailed—",
+ "text": "When we e-mailed—"
},
{
- "self_ref": "#/groups/20",
+ "self_ref": "#/texts/9",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/30"
- },
- {
- "$ref": "#/groups/21"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/21",
- "parent": {
- "$ref": "#/groups/20"
- },
- "children": [
- {
- "$ref": "#/texts/31"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/32"
+ "start_time": 30.5,
+ "end_time": 32.5,
+ "voice": "Neil deGrasse Tyson"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Didn’t we talk about enough in that conversation?",
+ "text": "Didn’t we talk about enough in that conversation?"
},
{
- "self_ref": "#/groups/22",
+ "self_ref": "#/texts/10",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/33"
- },
- {
- "$ref": "#/groups/23"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/23",
- "parent": {
- "$ref": "#/groups/22"
- },
- "children": [
- {
- "$ref": "#/texts/34"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/35"
+ "start_time": 32.0,
+ "end_time": 35.5,
+ "voice": "Roger Bingham"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "No! No no no no; 'cos 'cos obviously 'cos",
+ "text": "No! No no no no; 'cos 'cos obviously 'cos"
},
{
- "self_ref": "#/groups/24",
+ "self_ref": "#/texts/11",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/36"
- },
- {
- "$ref": "#/groups/25"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/25",
- "parent": {
- "$ref": "#/groups/24"
- },
- "children": [
- {
- "$ref": "#/texts/37"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/38"
+ "start_time": 32.5,
+ "end_time": 33.5,
+ "voice": "Neil deGrasse Tyson"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- }
- ],
- "texts": [
- {
- "self_ref": "#/texts/0",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:11.000 --> 00:13.000",
- "text": "00:11.000 --> 00:13.000"
- },
- {
- "self_ref": "#/texts/1",
- "parent": {
- "$ref": "#/groups/1"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/2",
- "parent": {
- "$ref": "#/groups/1"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "We are in New York City",
- "text": "We are in New York City",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/3",
- "parent": {
- "$ref": "#/groups/2"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:13.000 --> 00:16.000",
- "text": "00:13.000 --> 00:16.000"
- },
- {
- "self_ref": "#/texts/4",
- "parent": {
- "$ref": "#/groups/3"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/5",
- "parent": {
- "$ref": "#/groups/3"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "We’re actually at the Lucern Hotel, just down the street",
- "text": "We’re actually at the Lucern Hotel, just down the street",
+ "orig": "Laughs",
+ "text": "Laughs",
"formatting": {
"bold": false,
- "italic": false,
+ "italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
- "self_ref": "#/texts/6",
+ "self_ref": "#/texts/12",
"parent": {
- "$ref": "#/groups/4"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "00:16.000 --> 00:18.000",
- "text": "00:16.000 --> 00:18.000"
- },
- {
- "self_ref": "#/texts/7",
- "parent": {
- "$ref": "#/groups/5"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/8",
- "parent": {
- "$ref": "#/groups/5"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "from the American Museum of Natural History",
- "text": "from the American Museum of Natural History",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/9",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:18.000 --> 00:20.000",
- "text": "00:18.000 --> 00:20.000"
- },
- {
- "self_ref": "#/texts/10",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/11",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "And with me is Neil deGrasse Tyson",
- "text": "And with me is Neil deGrasse Tyson",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/12",
- "parent": {
- "$ref": "#/groups/8"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:20.000 --> 00:22.000",
- "text": "00:20.000 --> 00:22.000"
- },
- {
- "self_ref": "#/texts/13",
- "parent": {
- "$ref": "#/groups/9"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/14",
- "parent": {
- "$ref": "#/groups/9"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Astrophysicist, Director of the Hayden Planetarium",
- "text": "Astrophysicist, Director of the Hayden Planetarium",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/15",
- "parent": {
- "$ref": "#/groups/10"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:22.000 --> 00:24.000",
- "text": "00:22.000 --> 00:24.000"
- },
- {
- "self_ref": "#/texts/16",
- "parent": {
- "$ref": "#/groups/11"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/17",
- "parent": {
- "$ref": "#/groups/11"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "at the AMNH.",
- "text": "at the AMNH.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/18",
- "parent": {
- "$ref": "#/groups/12"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:24.000 --> 00:26.000",
- "text": "00:24.000 --> 00:26.000"
- },
- {
- "self_ref": "#/texts/19",
- "parent": {
- "$ref": "#/groups/13"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/20",
- "parent": {
- "$ref": "#/groups/13"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Thank you for walking down here.",
- "text": "Thank you for walking down here.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/21",
- "parent": {
- "$ref": "#/groups/14"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:27.000 --> 00:30.000",
- "text": "00:27.000 --> 00:30.000"
- },
- {
- "self_ref": "#/texts/22",
- "parent": {
- "$ref": "#/groups/15"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/23",
- "parent": {
- "$ref": "#/groups/15"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "And I want to do a follow-up on the last conversation we did.",
- "text": "And I want to do a follow-up on the last conversation we did.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/24",
- "parent": {
- "$ref": "#/groups/16"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:30.000 --> 00:31.500",
- "text": "00:30.000 --> 00:31.500"
- },
- {
- "self_ref": "#/texts/25",
- "parent": {
- "$ref": "#/groups/17"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/26",
- "parent": {
- "$ref": "#/groups/17"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "When we e-mailed—",
- "text": "When we e-mailed—",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/27",
- "parent": {
- "$ref": "#/groups/18"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:30.500 --> 00:32.500",
- "text": "00:30.500 --> 00:32.500"
- },
- {
- "self_ref": "#/texts/28",
- "parent": {
- "$ref": "#/groups/19"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Neil deGrasse Tyson: ",
- "text": "Neil deGrasse Tyson: "
- },
- {
- "self_ref": "#/texts/29",
- "parent": {
- "$ref": "#/groups/19"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Didn’t we talk about enough in that conversation?",
- "text": "Didn’t we talk about enough in that conversation?",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/30",
- "parent": {
- "$ref": "#/groups/20"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:32.000 --> 00:35.500",
- "text": "00:32.000 --> 00:35.500"
- },
- {
- "self_ref": "#/texts/31",
- "parent": {
- "$ref": "#/groups/21"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/32",
- "parent": {
- "$ref": "#/groups/21"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "No! No no no no; 'cos 'cos obviously 'cos",
- "text": "No! No no no no; 'cos 'cos obviously 'cos",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/33",
- "parent": {
- "$ref": "#/groups/22"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:32.500 --> 00:33.500",
- "text": "00:32.500 --> 00:33.500"
- },
- {
- "self_ref": "#/texts/34",
- "parent": {
- "$ref": "#/groups/23"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Neil deGrasse Tyson: ",
- "text": "Neil deGrasse Tyson: "
- },
- {
- "self_ref": "#/texts/35",
- "parent": {
- "$ref": "#/groups/23"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Laughs",
- "text": "Laughs",
- "formatting": {
- "bold": false,
- "italic": true,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/36",
- "parent": {
- "$ref": "#/groups/24"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:35.500 --> 00:38.000",
- "text": "00:35.500 --> 00:38.000"
- },
- {
- "self_ref": "#/texts/37",
- "parent": {
- "$ref": "#/groups/25"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Roger Bingham: ",
- "text": "Roger Bingham: "
- },
- {
- "self_ref": "#/texts/38",
- "parent": {
- "$ref": "#/groups/25"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
+ "prov": [
+ {
+ "start_time": 35.5,
+ "end_time": 38.0,
+ "voice": "Roger Bingham"
+ }
+ ],
"orig": "You know I’m so excited my glasses are falling off here.",
- "text": "You know I’m so excited my glasses are falling off here.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "text": "You know I’m so excited my glasses are falling off here."
}
],
"pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
index c57670289..95d9e6575 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
@@ -1,51 +1,25 @@
-00:11.000 --> 00:13.000
+We are in New York City
-Roger Bingham: We are in New York City
+We’re actually at the Lucern Hotel, just down the street
-00:13.000 --> 00:16.000
+from the American Museum of Natural History
-Roger Bingham: We’re actually at the Lucern Hotel, just down the street
+And with me is Neil deGrasse Tyson
-00:16.000 --> 00:18.000
+Astrophysicist, Director of the Hayden Planetarium
-Roger Bingham: from the American Museum of Natural History
+at the AMNH.
-00:18.000 --> 00:20.000
+Thank you for walking down here.
-Roger Bingham: And with me is Neil deGrasse Tyson
+And I want to do a follow-up on the last conversation we did.
-00:20.000 --> 00:22.000
+When we e-mailed—
-Roger Bingham: Astrophysicist, Director of the Hayden Planetarium
+Didn’t we talk about enough in that conversation?
-00:22.000 --> 00:24.000
+No! No no no no; 'cos 'cos obviously 'cos
-Roger Bingham: at the AMNH.
+*Laughs*
-00:24.000 --> 00:26.000
-
-Roger Bingham: Thank you for walking down here.
-
-00:27.000 --> 00:30.000
-
-Roger Bingham: And I want to do a follow-up on the last conversation we did.
-
-00:30.000 --> 00:31.500
-
-Roger Bingham: When we e-mailed—
-
-00:30.500 --> 00:32.500
-
-Neil deGrasse Tyson: Didn’t we talk about enough in that conversation?
-
-00:32.000 --> 00:35.500
-
-Roger Bingham: No! No no no no; 'cos 'cos obviously 'cos
-
-00:32.500 --> 00:33.500
-
-Neil deGrasse Tyson: *Laughs*
-
-00:35.500 --> 00:38.000
-
-Roger Bingham: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
+You know I’m so excited my glasses are falling off here.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
index 6d90404ff..56f63bc3f 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
@@ -1,22 +1,12 @@
item-0 at level 0: unspecified: group _root_
- item-1 at level 1: section: group WebVTT cue block
- item-2 at level 2: text: 00:00.000 --> 00:02.000
- item-3 at level 2: inline: group WebVTT cue voice span
- item-4 at level 3: text: Esme (first, loud):
- item-5 at level 3: text: It’s a blue apple tree!
- item-6 at level 1: section: group WebVTT cue block
- item-7 at level 2: text: 00:02.000 --> 00:04.000
- item-8 at level 2: inline: group WebVTT cue voice span
- item-9 at level 3: text: Mary:
- item-10 at level 3: text: No way!
- item-11 at level 1: section: group WebVTT cue block
- item-12 at level 2: text: 00:04.000 --> 00:06.000
- item-13 at level 2: inline: group WebVTT cue voice span
- item-14 at level 3: text: Esme:
- item-15 at level 3: text: Hee!
- item-16 at level 2: text: laughter
- item-17 at level 1: section: group WebVTT cue block
- item-18 at level 2: text: 00:06.000 --> 00:08.000
- item-19 at level 2: inline: group WebVTT cue voice span
- item-20 at level 3: text: Mary (loud):
- item-21 at level 3: text: That’s awesome!
\ No newline at end of file
+ item-1 at level 1: text: It’s a blue apple tree!
+ item-2 at level 1: text: No way!
+ item-3 at level 1: inline: group WebVTT cue span
+ item-4 at level 2: text: Hee!
+ item-5 at level 2: text:
+ item-6 at level 2: text: laughter
+ item-7 at level 1: text: That’s awesome!
+ item-8 at level 1: inline: group WebVTT cue span
+ item-9 at level 2: text: Sur les
+ item-10 at level 2: text: playground
+ item-11 at level 2: text: , ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
index 72647d93d..67a95ef50 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
@@ -1,10 +1,10 @@
{
"schema_name": "DoclingDocument",
- "version": "1.7.0",
+ "version": "1.8.0",
"name": "webvtt_example_02",
"origin": {
"mimetype": "text/vtt",
- "binary_hash": 5029965721282070624,
+ "binary_hash": 8584853280299071027,
"filename": "webvtt_example_02.vtt"
},
"furniture": {
@@ -18,16 +18,19 @@
"self_ref": "#/body",
"children": [
{
- "$ref": "#/groups/0"
+ "$ref": "#/texts/0"
+ },
+ {
+ "$ref": "#/texts/1"
},
{
- "$ref": "#/groups/2"
+ "$ref": "#/groups/0"
},
{
- "$ref": "#/groups/4"
+ "$ref": "#/texts/5"
},
{
- "$ref": "#/groups/6"
+ "$ref": "#/groups/1"
}
],
"content_layer": "body",
@@ -41,70 +44,22 @@
"$ref": "#/body"
},
"children": [
- {
- "$ref": "#/texts/0"
- },
- {
- "$ref": "#/groups/1"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/1",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [
- {
- "$ref": "#/texts/1"
- },
{
"$ref": "#/texts/2"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- },
- {
- "self_ref": "#/groups/2",
- "parent": {
- "$ref": "#/body"
- },
- "children": [
+ },
{
"$ref": "#/texts/3"
},
- {
- "$ref": "#/groups/3"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/3",
- "parent": {
- "$ref": "#/groups/2"
- },
- "children": [
{
"$ref": "#/texts/4"
- },
- {
- "$ref": "#/texts/5"
}
],
"content_layer": "body",
- "name": "WebVTT cue voice span",
+ "name": "WebVTT cue span",
"label": "inline"
},
{
- "self_ref": "#/groups/4",
+ "self_ref": "#/groups/1",
"parent": {
"$ref": "#/body"
},
@@ -112,23 +67,6 @@
{
"$ref": "#/texts/6"
},
- {
- "$ref": "#/groups/5"
- },
- {
- "$ref": "#/texts/9"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/5",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [
{
"$ref": "#/texts/7"
},
@@ -137,41 +75,7 @@
}
],
"content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- },
- {
- "self_ref": "#/groups/6",
- "parent": {
- "$ref": "#/body"
- },
- "children": [
- {
- "$ref": "#/texts/10"
- },
- {
- "$ref": "#/groups/7"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/7",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [
- {
- "$ref": "#/texts/11"
- },
- {
- "$ref": "#/texts/12"
- }
- ],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
+ "name": "WebVTT cue span",
"label": "inline"
}
],
@@ -179,143 +83,161 @@
{
"self_ref": "#/texts/0",
"parent": {
- "$ref": "#/groups/0"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "00:00.000 --> 00:02.000",
- "text": "00:00.000 --> 00:02.000"
+ "prov": [
+ {
+ "start_time": 0.0,
+ "end_time": 2.0,
+ "voice": "Esme",
+ "classes": [
+ "v.first.loud"
+ ]
+ }
+ ],
+ "orig": "It’s a blue apple tree!",
+ "text": "It’s a blue apple tree!"
},
{
"self_ref": "#/texts/1",
"parent": {
- "$ref": "#/groups/1"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "Esme (first, loud): ",
- "text": "Esme (first, loud): "
+ "prov": [
+ {
+ "start_time": 2.0,
+ "end_time": 4.0,
+ "voice": "Mary"
+ }
+ ],
+ "orig": "No way!",
+ "text": "No way!"
},
{
"self_ref": "#/texts/2",
"parent": {
- "$ref": "#/groups/1"
+ "$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "It’s a blue apple tree!",
- "text": "It’s a blue apple tree!",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "prov": [
+ {
+ "start_time": 4.0,
+ "end_time": 6.0,
+ "voice": "Esme"
+ }
+ ],
+ "orig": "Hee!",
+ "text": "Hee!"
},
{
"self_ref": "#/texts/3",
"parent": {
- "$ref": "#/groups/2"
+ "$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "00:02.000 --> 00:04.000",
- "text": "00:02.000 --> 00:04.000"
+ "prov": [
+ {
+ "start_time": 4.0,
+ "end_time": 6.0
+ }
+ ],
+ "orig": " ",
+ "text": " "
},
{
"self_ref": "#/texts/4",
"parent": {
- "$ref": "#/groups/3"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Mary: ",
- "text": "Mary: "
- },
- {
- "self_ref": "#/texts/5",
- "parent": {
- "$ref": "#/groups/3"
+ "$ref": "#/groups/0"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "No way!",
- "text": "No way!",
+ "prov": [
+ {
+ "start_time": 4.0,
+ "end_time": 6.0
+ }
+ ],
+ "orig": "laughter",
+ "text": "laughter",
"formatting": {
"bold": false,
- "italic": false,
+ "italic": true,
"underline": false,
"strikethrough": false,
"script": "baseline"
}
},
{
- "self_ref": "#/texts/6",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:04.000 --> 00:06.000",
- "text": "00:04.000 --> 00:06.000"
- },
- {
- "self_ref": "#/texts/7",
+ "self_ref": "#/texts/5",
"parent": {
- "$ref": "#/groups/5"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "Esme: ",
- "text": "Esme: "
+ "prov": [
+ {
+ "start_time": 6.0,
+ "end_time": 8.0,
+ "voice": "Mary",
+ "classes": [
+ "v.loud"
+ ]
+ }
+ ],
+ "orig": "That’s awesome!",
+ "text": "That’s awesome!"
},
{
- "self_ref": "#/texts/8",
+ "self_ref": "#/texts/6",
"parent": {
- "$ref": "#/groups/5"
+ "$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "Hee!",
- "text": "Hee!",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "prov": [
+ {
+ "start_time": 8.0,
+ "end_time": 10.0
+ }
+ ],
+ "orig": "Sur les ",
+ "text": "Sur les "
},
{
- "self_ref": "#/texts/9",
+ "self_ref": "#/texts/7",
"parent": {
- "$ref": "#/groups/4"
+ "$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "laughter",
- "text": "laughter",
+ "prov": [
+ {
+ "start_time": 8.0,
+ "end_time": 10.0,
+ "languages": [
+ "en"
+ ],
+ "classes": [
+ "i.foreignphrase"
+ ]
+ }
+ ],
+ "orig": "playground",
+ "text": "playground",
"formatting": {
"bold": false,
"italic": true,
@@ -325,47 +247,21 @@
}
},
{
- "self_ref": "#/texts/10",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:06.000 --> 00:08.000",
- "text": "00:06.000 --> 00:08.000"
- },
- {
- "self_ref": "#/texts/11",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Mary (loud): ",
- "text": "Mary (loud): "
- },
- {
- "self_ref": "#/texts/12",
+ "self_ref": "#/texts/8",
"parent": {
- "$ref": "#/groups/7"
+ "$ref": "#/groups/1"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "That’s awesome!",
- "text": "That’s awesome!",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "prov": [
+ {
+ "start_time": 8.0,
+ "end_time": 10.0
+ }
+ ],
+ "orig": ", ici à Montpellier",
+ "text": ", ici à Montpellier"
}
],
"pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
index db84cf116..5c6485f3a 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
@@ -1,17 +1,9 @@
-00:00.000 --> 00:02.000
+It’s a blue apple tree!
-Esme (first, loud): It’s a blue apple tree!
+No way!
-00:02.000 --> 00:04.000
+Hee! *laughter*
-Mary: No way!
+That’s awesome!
-00:04.000 --> 00:06.000
-
-Esme: Hee!
-
-*laughter*
-
-00:06.000 --> 00:08.000
-
-Mary (loud): That’s awesome!
\ No newline at end of file
+Sur les *playground* , ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
index ca344e595..a46794123 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
@@ -1,77 +1,18 @@
item-0 at level 0: unspecified: group _root_
- item-1 at level 1: section: group WebVTT cue block
- item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
- item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
- item-4 at level 2: inline: group WebVTT cue voice span
- item-5 at level 3: text: Speaker A:
- item-6 at level 3: text: OK, I think now we should be recording
- item-7 at level 1: section: group WebVTT cue block
- item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
- item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
- item-10 at level 2: inline: group WebVTT cue voice span
- item-11 at level 3: text: Speaker A:
- item-12 at level 3: text: properly.
- item-13 at level 1: section: group WebVTT cue block
- item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
- item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
- item-16 at level 2: text: Good.
- item-17 at level 1: section: group WebVTT cue block
- item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
- item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
- item-20 at level 2: inline: group WebVTT cue voice span
- item-21 at level 3: text: Speaker A:
- item-22 at level 3: text: Yeah.
- item-23 at level 1: section: group WebVTT cue block
- item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
- item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
- item-26 at level 2: inline: group WebVTT cue voice span
- item-27 at level 3: text: Speaker B:
- item-28 at level 3: text: I was also thinking.
- item-29 at level 1: section: group WebVTT cue block
- item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
- item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
- item-32 at level 2: inline: group WebVTT cue voice span
- item-33 at level 3: text: Speaker B:
- item-34 at level 3: text: Would be maybe good to create items,
- item-35 at level 1: section: group WebVTT cue block
- item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
- item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
- item-38 at level 2: inline: group WebVTT cue voice span
- item-39 at level 3: text: Speaker B:
- item-40 at level 3: text: some metadata, some options that can be specific.
- item-41 at level 1: section: group WebVTT cue block
- item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
- item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
- item-44 at level 2: inline: group WebVTT cue voice span
- item-45 at level 3: text: Speaker A:
- item-46 at level 3: text: Yeah, I mean I think you went even more than
- item-47 at level 1: section: group WebVTT cue block
- item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
- item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
- item-50 at level 2: inline: group WebVTT cue voice span
- item-51 at level 3: text: Speaker B:
- item-52 at level 3: text: But we preserved the atoms.
- item-53 at level 1: section: group WebVTT cue block
- item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
- item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
- item-56 at level 2: inline: group WebVTT cue voice span
- item-57 at level 3: text: Speaker A:
- item-58 at level 3: text: than me. I just opened the format.
- item-59 at level 1: section: group WebVTT cue block
- item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
- item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
- item-62 at level 2: inline: group WebVTT cue voice span
- item-63 at level 3: text: Speaker A:
- item-64 at level 3: text: give it a try, yeah.
- item-65 at level 1: section: group WebVTT cue block
- item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
- item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
- item-68 at level 2: inline: group WebVTT cue voice span
- item-69 at level 3: text: Speaker B:
- item-70 at level 3: text: Okay, talk to you later.
- item-71 at level 1: section: group WebVTT cue block
- item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
- item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
- item-74 at level 2: inline: group WebVTT cue voice span
- item-75 at level 3: text: Speaker A:
- item-76 at level 3: text: See you.
\ No newline at end of file
+ item-1 at level 1: text: OK,
+ item-2 at level 1: text: I think now we should be recording
+ item-3 at level 1: text: properly.
+ item-4 at level 1: text: Good.
+ item-5 at level 1: text: Yeah.
+ item-6 at level 1: text: I was also thinking.
+ item-7 at level 1: text: Would be maybe good to create items,
+ item-8 at level 1: text: some metadata,
+ item-9 at level 1: text: some options that can be specific.
+ item-10 at level 1: text: Yeah,
+ item-11 at level 1: text: I mean I think you went even more than
+ item-12 at level 1: text: But we preserved the atoms.
+ item-13 at level 1: text: than me.
+ item-14 at level 1: text: I just opened the format.
+ item-15 at level 1: text: give it a try, yeah.
+ item-16 at level 1: text: Okay, talk to you later.
+ item-17 at level 1: text: See you.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
index 5df08e2bf..dddce0f28 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
@@ -1,6 +1,6 @@
{
"schema_name": "DoclingDocument",
- "version": "1.7.0",
+ "version": "1.8.0",
"name": "webvtt_example_03",
"origin": {
"mimetype": "text/vtt",
@@ -18,1218 +18,384 @@
"self_ref": "#/body",
"children": [
{
- "$ref": "#/groups/0"
+ "$ref": "#/texts/0"
},
{
- "$ref": "#/groups/2"
+ "$ref": "#/texts/1"
},
{
- "$ref": "#/groups/4"
+ "$ref": "#/texts/2"
},
{
- "$ref": "#/groups/5"
+ "$ref": "#/texts/3"
},
{
- "$ref": "#/groups/7"
+ "$ref": "#/texts/4"
},
{
- "$ref": "#/groups/9"
+ "$ref": "#/texts/5"
},
{
- "$ref": "#/groups/11"
+ "$ref": "#/texts/6"
},
{
- "$ref": "#/groups/13"
+ "$ref": "#/texts/7"
},
{
- "$ref": "#/groups/15"
+ "$ref": "#/texts/8"
},
{
- "$ref": "#/groups/17"
+ "$ref": "#/texts/9"
},
{
- "$ref": "#/groups/19"
+ "$ref": "#/texts/10"
},
{
- "$ref": "#/groups/21"
+ "$ref": "#/texts/11"
},
{
- "$ref": "#/groups/23"
+ "$ref": "#/texts/12"
+ },
+ {
+ "$ref": "#/texts/13"
+ },
+ {
+ "$ref": "#/texts/14"
+ },
+ {
+ "$ref": "#/texts/15"
+ },
+ {
+ "$ref": "#/texts/16"
}
],
"content_layer": "body",
"name": "_root_",
"label": "unspecified"
},
- "groups": [
+ "groups": [],
+ "texts": [
{
- "self_ref": "#/groups/0",
+ "self_ref": "#/texts/0",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/0"
- },
- {
- "$ref": "#/texts/1"
- },
- {
- "$ref": "#/groups/1"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/1",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [
- {
- "$ref": "#/texts/2"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/3"
+ "start_time": 4.963,
+ "end_time": 8.571,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "OK,",
+ "text": "OK,"
},
{
- "self_ref": "#/groups/2",
+ "self_ref": "#/texts/1",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/4"
- },
- {
- "$ref": "#/texts/5"
- },
- {
- "$ref": "#/groups/3"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/3",
- "parent": {
- "$ref": "#/groups/2"
- },
- "children": [
- {
- "$ref": "#/texts/6"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/7"
+ "start_time": 4.963,
+ "end_time": 8.571,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "I think now we should be recording",
+ "text": "I think now we should be recording"
},
{
- "self_ref": "#/groups/4",
+ "self_ref": "#/texts/2",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/8"
- },
- {
- "$ref": "#/texts/9"
- },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/10"
+ "start_time": 8.571,
+ "end_time": 9.403,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
+ "orig": "properly.",
+ "text": "properly."
},
{
- "self_ref": "#/groups/5",
+ "self_ref": "#/texts/3",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/11"
- },
- {
- "$ref": "#/texts/12"
- },
- {
- "$ref": "#/groups/6"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/6",
- "parent": {
- "$ref": "#/groups/5"
- },
- "children": [
- {
- "$ref": "#/texts/13"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/14"
+ "start_time": 10.683,
+ "end_time": 11.563,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Good.",
+ "text": "Good."
},
{
- "self_ref": "#/groups/7",
+ "self_ref": "#/texts/4",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/15"
- },
- {
- "$ref": "#/texts/16"
- },
- {
- "$ref": "#/groups/8"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/8",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [
- {
- "$ref": "#/texts/17"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/18"
+ "start_time": 13.363,
+ "end_time": 13.803,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Yeah.",
+ "text": "Yeah."
},
{
- "self_ref": "#/groups/9",
+ "self_ref": "#/texts/5",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/19"
- },
- {
- "$ref": "#/texts/20"
- },
- {
- "$ref": "#/groups/10"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/10",
- "parent": {
- "$ref": "#/groups/9"
- },
- "children": [
- {
- "$ref": "#/texts/21"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/22"
+ "start_time": 49.603,
+ "end_time": 53.363,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
+ "voice": "Speaker B"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "I was also thinking.",
+ "text": "I was also thinking."
},
{
- "self_ref": "#/groups/11",
+ "self_ref": "#/texts/6",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/23"
- },
- {
- "$ref": "#/texts/24"
- },
- {
- "$ref": "#/groups/12"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/12",
- "parent": {
- "$ref": "#/groups/11"
- },
- "children": [
- {
- "$ref": "#/texts/25"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/26"
+ "start_time": 54.963,
+ "end_time": 62.072,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
+ "voice": "Speaker B"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Would be maybe good to create items,",
+ "text": "Would be maybe good to create items,"
},
{
- "self_ref": "#/groups/13",
+ "self_ref": "#/texts/7",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/27"
- },
- {
- "$ref": "#/texts/28"
- },
- {
- "$ref": "#/groups/14"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/14",
- "parent": {
- "$ref": "#/groups/13"
- },
- "children": [
- {
- "$ref": "#/texts/29"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/30"
+ "start_time": 62.072,
+ "end_time": 66.811,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+ "voice": "Speaker B"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "some metadata,",
+ "text": "some metadata,"
},
{
- "self_ref": "#/groups/15",
+ "self_ref": "#/texts/8",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/31"
- },
- {
- "$ref": "#/texts/32"
- },
- {
- "$ref": "#/groups/16"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/16",
- "parent": {
- "$ref": "#/groups/15"
- },
- "children": [
- {
- "$ref": "#/texts/33"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/34"
+ "start_time": 62.072,
+ "end_time": 66.811,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+ "voice": "Speaker B"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "some options that can be specific.",
+ "text": "some options that can be specific."
},
{
- "self_ref": "#/groups/17",
+ "self_ref": "#/texts/9",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/35"
- },
- {
- "$ref": "#/texts/36"
- },
- {
- "$ref": "#/groups/18"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/18",
- "parent": {
- "$ref": "#/groups/17"
- },
- "children": [
- {
- "$ref": "#/texts/37"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/38"
+ "start_time": 70.243,
+ "end_time": 73.014,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "Yeah,",
+ "text": "Yeah,"
},
{
- "self_ref": "#/groups/19",
+ "self_ref": "#/texts/10",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/39"
- },
- {
- "$ref": "#/texts/40"
- },
- {
- "$ref": "#/groups/20"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/20",
- "parent": {
- "$ref": "#/groups/19"
- },
- "children": [
- {
- "$ref": "#/texts/41"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/42"
+ "start_time": 70.243,
+ "end_time": 73.014,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "I mean I think you went even more than",
+ "text": "I mean I think you went even more than"
},
{
- "self_ref": "#/groups/21",
+ "self_ref": "#/texts/11",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/43"
- },
- {
- "$ref": "#/texts/44"
- },
- {
- "$ref": "#/groups/22"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/22",
- "parent": {
- "$ref": "#/groups/21"
- },
- "children": [
- {
- "$ref": "#/texts/45"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/46"
+ "start_time": 70.563,
+ "end_time": 72.643,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
+ "voice": "Speaker B"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
+ "orig": "But we preserved the atoms.",
+ "text": "But we preserved the atoms."
},
{
- "self_ref": "#/groups/23",
+ "self_ref": "#/texts/12",
"parent": {
"$ref": "#/body"
},
- "children": [
- {
- "$ref": "#/texts/47"
- },
- {
- "$ref": "#/texts/48"
- },
- {
- "$ref": "#/groups/24"
- }
- ],
+ "children": [],
"content_layer": "body",
- "name": "WebVTT cue block",
- "label": "section"
- },
- {
- "self_ref": "#/groups/24",
- "parent": {
- "$ref": "#/groups/23"
- },
- "children": [
- {
- "$ref": "#/texts/49"
- },
+ "label": "text",
+ "prov": [
{
- "$ref": "#/texts/50"
+ "start_time": 73.014,
+ "end_time": 75.907,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+ "voice": "Speaker A"
}
],
- "content_layer": "body",
- "name": "WebVTT cue voice span",
- "label": "inline"
- }
- ],
- "texts": [
- {
- "self_ref": "#/texts/0",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
- },
- {
- "self_ref": "#/texts/1",
- "parent": {
- "$ref": "#/groups/0"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:00:04.963 --> 00:00:08.571",
- "text": "00:00:04.963 --> 00:00:08.571"
- },
- {
- "self_ref": "#/texts/2",
- "parent": {
- "$ref": "#/groups/1"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/3",
- "parent": {
- "$ref": "#/groups/1"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "OK, I think now we should be recording",
- "text": "OK, I think now we should be recording",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "orig": "than me.",
+ "text": "than me."
},
{
- "self_ref": "#/texts/4",
+ "self_ref": "#/texts/13",
"parent": {
- "$ref": "#/groups/2"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1"
+ "prov": [
+ {
+ "start_time": 73.014,
+ "end_time": 75.907,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+ "voice": "Speaker A"
+ }
+ ],
+ "orig": "I just opened the format.",
+ "text": "I just opened the format."
},
{
- "self_ref": "#/texts/5",
+ "self_ref": "#/texts/14",
"parent": {
- "$ref": "#/groups/2"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "00:00:08.571 --> 00:00:09.403",
- "text": "00:00:08.571 --> 00:00:09.403"
+ "prov": [
+ {
+ "start_time": 110.222,
+ "end_time": 111.643,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
+ "voice": "Speaker A"
+ }
+ ],
+ "orig": "give it a try, yeah.",
+ "text": "give it a try, yeah."
},
{
- "self_ref": "#/texts/6",
+ "self_ref": "#/texts/15",
"parent": {
- "$ref": "#/groups/3"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
+ "prov": [
+ {
+ "start_time": 112.043,
+ "end_time": 115.043,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
+ "voice": "Speaker B"
+ }
+ ],
+ "orig": "Okay, talk to you later.",
+ "text": "Okay, talk to you later."
},
{
- "self_ref": "#/texts/7",
+ "self_ref": "#/texts/16",
"parent": {
- "$ref": "#/groups/3"
+ "$ref": "#/body"
},
"children": [],
"content_layer": "body",
"label": "text",
- "prov": [],
- "orig": "properly.",
- "text": "properly.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/8",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
- },
- {
- "self_ref": "#/texts/9",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:00:10.683 --> 00:00:11.563",
- "text": "00:00:10.683 --> 00:00:11.563"
- },
- {
- "self_ref": "#/texts/10",
- "parent": {
- "$ref": "#/groups/4"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Good.",
- "text": "Good.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/11",
- "parent": {
- "$ref": "#/groups/5"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0"
- },
- {
- "self_ref": "#/texts/12",
- "parent": {
- "$ref": "#/groups/5"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:00:13.363 --> 00:00:13.803",
- "text": "00:00:13.363 --> 00:00:13.803"
- },
- {
- "self_ref": "#/texts/13",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/14",
- "parent": {
- "$ref": "#/groups/6"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Yeah.",
- "text": "Yeah.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/15",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0"
- },
- {
- "self_ref": "#/texts/16",
- "parent": {
- "$ref": "#/groups/7"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:00:49.603 --> 00:00:53.363",
- "text": "00:00:49.603 --> 00:00:53.363"
- },
- {
- "self_ref": "#/texts/17",
- "parent": {
- "$ref": "#/groups/8"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker B: ",
- "text": "Speaker B: "
- },
- {
- "self_ref": "#/texts/18",
- "parent": {
- "$ref": "#/groups/8"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "I was also thinking.",
- "text": "I was also thinking.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/19",
- "parent": {
- "$ref": "#/groups/9"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0"
- },
- {
- "self_ref": "#/texts/20",
- "parent": {
- "$ref": "#/groups/9"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:00:54.963 --> 00:01:02.072",
- "text": "00:00:54.963 --> 00:01:02.072"
- },
- {
- "self_ref": "#/texts/21",
- "parent": {
- "$ref": "#/groups/10"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker B: ",
- "text": "Speaker B: "
- },
- {
- "self_ref": "#/texts/22",
- "parent": {
- "$ref": "#/groups/10"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Would be maybe good to create items,",
- "text": "Would be maybe good to create items,",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/23",
- "parent": {
- "$ref": "#/groups/11"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1"
- },
- {
- "self_ref": "#/texts/24",
- "parent": {
- "$ref": "#/groups/11"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:02.072 --> 00:01:06.811",
- "text": "00:01:02.072 --> 00:01:06.811"
- },
- {
- "self_ref": "#/texts/25",
- "parent": {
- "$ref": "#/groups/12"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker B: ",
- "text": "Speaker B: "
- },
- {
- "self_ref": "#/texts/26",
- "parent": {
- "$ref": "#/groups/12"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "some metadata, some options that can be specific.",
- "text": "some metadata, some options that can be specific.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/27",
- "parent": {
- "$ref": "#/groups/13"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0"
- },
- {
- "self_ref": "#/texts/28",
- "parent": {
- "$ref": "#/groups/13"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:10.243 --> 00:01:13.014",
- "text": "00:01:10.243 --> 00:01:13.014"
- },
- {
- "self_ref": "#/texts/29",
- "parent": {
- "$ref": "#/groups/14"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/30",
- "parent": {
- "$ref": "#/groups/14"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Yeah, I mean I think you went even more than",
- "text": "Yeah, I mean I think you went even more than",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/31",
- "parent": {
- "$ref": "#/groups/15"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0"
- },
- {
- "self_ref": "#/texts/32",
- "parent": {
- "$ref": "#/groups/15"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:10.563 --> 00:01:12.643",
- "text": "00:01:10.563 --> 00:01:12.643"
- },
- {
- "self_ref": "#/texts/33",
- "parent": {
- "$ref": "#/groups/16"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker B: ",
- "text": "Speaker B: "
- },
- {
- "self_ref": "#/texts/34",
- "parent": {
- "$ref": "#/groups/16"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "But we preserved the atoms.",
- "text": "But we preserved the atoms.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/35",
- "parent": {
- "$ref": "#/groups/17"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1"
- },
- {
- "self_ref": "#/texts/36",
- "parent": {
- "$ref": "#/groups/17"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:13.014 --> 00:01:15.907",
- "text": "00:01:13.014 --> 00:01:15.907"
- },
- {
- "self_ref": "#/texts/37",
- "parent": {
- "$ref": "#/groups/18"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/38",
- "parent": {
- "$ref": "#/groups/18"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "than me. I just opened the format.",
- "text": "than me. I just opened the format.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/39",
- "parent": {
- "$ref": "#/groups/19"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1"
- },
- {
- "self_ref": "#/texts/40",
- "parent": {
- "$ref": "#/groups/19"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:50.222 --> 00:01:51.643",
- "text": "00:01:50.222 --> 00:01:51.643"
- },
- {
- "self_ref": "#/texts/41",
- "parent": {
- "$ref": "#/groups/20"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/42",
- "parent": {
- "$ref": "#/groups/20"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "give it a try, yeah.",
- "text": "give it a try, yeah.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/43",
- "parent": {
- "$ref": "#/groups/21"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0"
- },
- {
- "self_ref": "#/texts/44",
- "parent": {
- "$ref": "#/groups/21"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:52.043 --> 00:01:55.043",
- "text": "00:01:52.043 --> 00:01:55.043"
- },
- {
- "self_ref": "#/texts/45",
- "parent": {
- "$ref": "#/groups/22"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker B: ",
- "text": "Speaker B: "
- },
- {
- "self_ref": "#/texts/46",
- "parent": {
- "$ref": "#/groups/22"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Okay, talk to you later.",
- "text": "Okay, talk to you later.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
- },
- {
- "self_ref": "#/texts/47",
- "parent": {
- "$ref": "#/groups/23"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
- "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0"
- },
- {
- "self_ref": "#/texts/48",
- "parent": {
- "$ref": "#/groups/23"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "00:01:54.603 --> 00:01:55.283",
- "text": "00:01:54.603 --> 00:01:55.283"
- },
- {
- "self_ref": "#/texts/49",
- "parent": {
- "$ref": "#/groups/24"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
- "orig": "Speaker A: ",
- "text": "Speaker A: "
- },
- {
- "self_ref": "#/texts/50",
- "parent": {
- "$ref": "#/groups/24"
- },
- "children": [],
- "content_layer": "body",
- "label": "text",
- "prov": [],
+ "prov": [
+ {
+ "start_time": 114.603,
+ "end_time": 115.283,
+ "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
+ "voice": "Speaker A"
+ }
+ ],
"orig": "See you.",
- "text": "See you.",
- "formatting": {
- "bold": false,
- "italic": false,
- "underline": false,
- "strikethrough": false,
- "script": "baseline"
- }
+ "text": "See you."
}
],
"pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
index 859a6dde3..b58d350b3 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
@@ -1,77 +1,33 @@
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+OK,
-00:00:04.963 --> 00:00:08.571
+I think now we should be recording
-Speaker A: OK, I think now we should be recording
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
-
-00:00:08.571 --> 00:00:09.403
-
-Speaker A: properly.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
-
-00:00:10.683 --> 00:00:11.563
+properly.
Good.
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
-
-00:00:13.363 --> 00:00:13.803
-
-Speaker A: Yeah.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
-
-00:00:49.603 --> 00:00:53.363
-
-Speaker B: I was also thinking.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
-
-00:00:54.963 --> 00:01:02.072
-
-Speaker B: Would be maybe good to create items,
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
-
-00:01:02.072 --> 00:01:06.811
-
-Speaker B: some metadata, some options that can be specific.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
-
-00:01:10.243 --> 00:01:13.014
-
-Speaker A: Yeah, I mean I think you went even more than
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
-
-00:01:10.563 --> 00:01:12.643
-
-Speaker B: But we preserved the atoms.
+Yeah.
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+I was also thinking.
-00:01:13.014 --> 00:01:15.907
+Would be maybe good to create items,
-Speaker A: than me. I just opened the format.
+some metadata,
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+some options that can be specific.
-00:01:50.222 --> 00:01:51.643
+Yeah,
-Speaker A: give it a try, yeah.
+I mean I think you went even more than
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+But we preserved the atoms.
-00:01:52.043 --> 00:01:55.043
+than me.
-Speaker B: Okay, talk to you later.
+I just opened the format.
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+give it a try, yeah.
-00:01:54.603 --> 00:01:55.283
+Okay, talk to you later.
-Speaker A: See you.
\ No newline at end of file
+See you.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt
new file mode 100644
index 000000000..93feba5e9
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt
@@ -0,0 +1,14 @@
+item-0 at level 0: unspecified: group _root_
+ item-1 at level 1: text: Last night the chef surprised us with a culinary adventure.
+ item-2 at level 1: inline: group WebVTT cue span
+ item-3 at level 2: text: The waiter offered a
+ item-4 at level 2: text: steaming bowl of
+ item-5 at level 2: text: paella
+ item-6 at level 2: text: that instantly transported the diners to a sunny Mediterranean coast.
+ item-7 at level 1: inline: group WebVTT cue span
+ item-8 at level 2: text: The dessert’s
+ item-9 at level 2: text: unexpected
+ item-10 at level 2: text:
+ item-11 at level 2: text: arcobaleno
+ item-12 at level 2: text: of flavors
+ item-13 at level 2: text: left everyone in awe.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json
new file mode 100644
index 000000000..17ab9f501
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json
@@ -0,0 +1,344 @@
+{
+ "schema_name": "DoclingDocument",
+ "version": "1.8.0",
+ "name": "webvtt_example_04",
+ "origin": {
+ "mimetype": "text/vtt",
+ "binary_hash": 5389775195091554844,
+ "filename": "webvtt_example_04.vtt"
+ },
+ "furniture": {
+ "self_ref": "#/furniture",
+ "children": [],
+ "content_layer": "furniture",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "body": {
+ "self_ref": "#/body",
+ "children": [
+ {
+ "$ref": "#/texts/0"
+ },
+ {
+ "$ref": "#/groups/0"
+ },
+ {
+ "$ref": "#/groups/1"
+ }
+ ],
+ "content_layer": "body",
+ "name": "_root_",
+ "label": "unspecified"
+ },
+ "groups": [
+ {
+ "self_ref": "#/groups/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/1"
+ },
+ {
+ "$ref": "#/texts/2"
+ },
+ {
+ "$ref": "#/texts/3"
+ },
+ {
+ "$ref": "#/texts/4"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue span",
+ "label": "inline"
+ },
+ {
+ "self_ref": "#/groups/1",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [
+ {
+ "$ref": "#/texts/5"
+ },
+ {
+ "$ref": "#/texts/6"
+ },
+ {
+ "$ref": "#/texts/7"
+ },
+ {
+ "$ref": "#/texts/8"
+ },
+ {
+ "$ref": "#/texts/9"
+ },
+ {
+ "$ref": "#/texts/10"
+ }
+ ],
+ "content_layer": "body",
+ "name": "WebVTT cue span",
+ "label": "inline"
+ }
+ ],
+ "texts": [
+ {
+ "self_ref": "#/texts/0",
+ "parent": {
+ "$ref": "#/body"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14580.0,
+ "end_time": 14760.0,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": "Last night the chef surprised us with a culinary adventure.",
+ "text": "Last night the chef surprised us with a culinary adventure."
+ },
+ {
+ "self_ref": "#/texts/1",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": "The waiter offered a ",
+ "text": "The waiter offered a "
+ },
+ {
+ "self_ref": "#/texts/2",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": "steaming bowl of ",
+ "text": "steaming bowl of ",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/3",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234",
+ "languages": [
+ "es-ES"
+ ]
+ }
+ ],
+ "orig": "paella",
+ "text": "paella",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/4",
+ "parent": {
+ "$ref": "#/groups/0"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": " that instantly transported the diners to a sunny Mediterranean coast.",
+ "text": " that instantly transported the diners to a sunny Mediterranean coast."
+ },
+ {
+ "self_ref": "#/texts/5",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": "The dessert’s ",
+ "text": "The dessert’s "
+ },
+ {
+ "self_ref": "#/texts/6",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234",
+ "classes": [
+ "b.loud"
+ ]
+ }
+ ],
+ "orig": "unexpected",
+ "text": "unexpected",
+ "formatting": {
+ "bold": true,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/7",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": " ",
+ "text": " ",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/8",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234",
+ "languages": [
+ "it"
+ ]
+ }
+ ],
+ "orig": "arcobaleno",
+ "text": "arcobaleno",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": true,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/9",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": " of flavors",
+ "text": " of flavors",
+ "formatting": {
+ "bold": false,
+ "italic": true,
+ "underline": false,
+ "strikethrough": false,
+ "script": "baseline"
+ }
+ },
+ {
+ "self_ref": "#/texts/10",
+ "parent": {
+ "$ref": "#/groups/1"
+ },
+ "children": [],
+ "content_layer": "body",
+ "label": "text",
+ "prov": [
+ {
+ "start_time": 14760.0,
+ "end_time": 14818.239,
+ "identifier": "agcvs-08234"
+ }
+ ],
+ "orig": " left everyone in awe.",
+ "text": " left everyone in awe."
+ }
+ ],
+ "pictures": [],
+ "tables": [],
+ "key_value_items": [],
+ "form_items": [],
+ "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md
new file mode 100644
index 000000000..f2312a059
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md
@@ -0,0 +1,5 @@
+Last night the chef surprised us with a culinary adventure.
+
+The waiter offered a *steaming bowl of * *paella* that instantly transported the diners to a sunny Mediterranean coast.
+
+The dessert’s ***unexpected*** * * *arcobaleno* * of flavors* left everyone in awe.
\ No newline at end of file
diff --git a/tests/data/webvtt/webvtt_example_02.vtt b/tests/data/webvtt/webvtt_example_02.vtt
index 1152a1e8f..6bd182101 100644
--- a/tests/data/webvtt/webvtt_example_02.vtt
+++ b/tests/data/webvtt/webvtt_example_02.vtt
@@ -12,4 +12,7 @@ NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
Hee! laughter
00:06.000 --> 00:08.000
-That’s awesome!
\ No newline at end of file
+That’s awesome!
+
+00:08.000 --> 00:10.000
+Sur les playground, ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/webvtt/webvtt_example_04.vtt b/tests/data/webvtt/webvtt_example_04.vtt
new file mode 100644
index 000000000..fd7b788c0
--- /dev/null
+++ b/tests/data/webvtt/webvtt_example_04.vtt
@@ -0,0 +1,10 @@
+WEBVTT
+
+agcvs-08234
+04:03:00.000 --> 04:06:00.000
+Last night the chef surprised us with a culinary adventure.
+
+agcvs-08234
+04:06:00.000 --> 04:06:58.239
+The waiter offered a steaming bowl of paella that instantly transported the diners to a sunny Mediterranean coast.
+The dessert’s unexpected arcobaleno of flavors left everyone in awe.
\ No newline at end of file
diff --git a/tests/test_backend_vtt.py b/tests/test_backend_vtt.py
index a910671bb..54e91219d 100644
--- a/tests/test_backend_vtt.py
+++ b/tests/test_backend_vtt.py
@@ -1,19 +1,7 @@
-# Assisted by watsonx Code Assistant
-
from pathlib import Path
-import pytest
from docling_core.types.doc import DoclingDocument
-from pydantic import ValidationError
-from docling.backend.webvtt_backend import (
- _WebVTTCueItalicSpan,
- _WebVTTCueTextSpan,
- _WebVTTCueTimings,
- _WebVTTCueVoiceSpan,
- _WebVTTFile,
- _WebVTTTimestamp,
-)
from docling.datamodel.base_models import InputFormat
from docling.datamodel.document import ConversionResult
from docling.document_converter import DocumentConverter
@@ -24,187 +12,6 @@
GENERATE = GEN_TEST_DATA
-def test_vtt_cue_commponents():
- """Test WebVTT components."""
- valid_timestamps = [
- "00:01:02.345",
- "12:34:56.789",
- "02:34.567",
- "00:00:00.000",
- ]
- valid_total_seconds = [
- 1 * 60 + 2.345,
- 12 * 3600 + 34 * 60 + 56.789,
- 2 * 60 + 34.567,
- 0.0,
- ]
- for idx, ts in enumerate(valid_timestamps):
- model = _WebVTTTimestamp(raw=ts)
- assert model.seconds == valid_total_seconds[idx]
-
- """Test invalid WebVTT timestamps."""
- invalid_timestamps = [
- "00:60:02.345", # minutes > 59
- "00:01:60.345", # seconds > 59
- "00:01:02.1000", # milliseconds > 999
- "01:02:03", # missing milliseconds
- "01:02", # missing milliseconds
- ":01:02.345", # extra : for missing hours
- "abc:01:02.345", # invalid format
- ]
- for ts in invalid_timestamps:
- with pytest.raises(ValidationError):
- _WebVTTTimestamp(raw=ts)
-
- """Test the timestamp __str__ method."""
- model = _WebVTTTimestamp(raw="00:01:02.345")
- assert str(model) == "00:01:02.345"
-
- """Test valid cue timings."""
- start = _WebVTTTimestamp(raw="00:10.005")
- end = _WebVTTTimestamp(raw="00:14.007")
- cue_timings = _WebVTTCueTimings(start=start, end=end)
- assert cue_timings.start == start
- assert cue_timings.end == end
- assert str(cue_timings) == "00:10.005 --> 00:14.007"
-
- """Test invalid cue timings with end timestamp before start."""
- start = _WebVTTTimestamp(raw="00:10.700")
- end = _WebVTTTimestamp(raw="00:10.500")
- with pytest.raises(ValidationError) as excinfo:
- _WebVTTCueTimings(start=start, end=end)
- assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
-
- """Test invalid cue timings with missing end."""
- start = _WebVTTTimestamp(raw="00:10.500")
- with pytest.raises(ValidationError) as excinfo:
- _WebVTTCueTimings(start=start)
- assert "Field required" in str(excinfo.value)
-
- """Test invalid cue timings with missing start."""
- end = _WebVTTTimestamp(raw="00:10.500")
- with pytest.raises(ValidationError) as excinfo:
- _WebVTTCueTimings(end=end)
- assert "Field required" in str(excinfo.value)
-
- """Test with valid text."""
- valid_text = "This is a valid cue text span."
- span = _WebVTTCueTextSpan(text=valid_text)
- assert span.text == valid_text
- assert str(span) == valid_text
-
- """Test with text containing newline characters."""
- invalid_text = "This cue text span\ncontains a newline."
- with pytest.raises(ValidationError):
- _WebVTTCueTextSpan(text=invalid_text)
-
- """Test with text containing ampersand."""
- invalid_text = "This cue text span contains &."
- with pytest.raises(ValidationError):
- _WebVTTCueTextSpan(text=invalid_text)
-
- """Test with text containing less-than sign."""
- invalid_text = "This cue text span contains <."
- with pytest.raises(ValidationError):
- _WebVTTCueTextSpan(text=invalid_text)
-
- """Test with empty text."""
- with pytest.raises(ValidationError):
- _WebVTTCueTextSpan(text="")
-
- """Test that annotation validation works correctly."""
- valid_annotation = "valid-annotation"
- invalid_annotation = "invalid\nannotation"
- with pytest.raises(ValidationError):
- _WebVTTCueVoiceSpan(annotation=invalid_annotation)
- assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
-
- """Test that classes validation works correctly."""
- annotation = "speaker name"
- valid_classes = ["class1", "class2"]
- invalid_classes = ["class\nwith\nnewlines", ""]
- with pytest.raises(ValidationError):
- _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
- assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
-
- """Test that components validation works correctly."""
- annotation = "speaker name"
- valid_components = [_WebVTTCueTextSpan(text="random text")]
- invalid_components = [123, "not a component"]
- with pytest.raises(ValidationError):
- _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
- assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
-
- """Test valid cue voice spans."""
- cue_span = _WebVTTCueVoiceSpan(
- annotation="speaker",
- classes=["loud", "clear"],
- components=[_WebVTTCueTextSpan(text="random text")],
- )
-
- expected_str = "random text"
- assert str(cue_span) == expected_str
-
- cue_span = _WebVTTCueVoiceSpan(
- annotation="speaker",
- components=[_WebVTTCueTextSpan(text="random text")],
- )
- expected_str = "random text"
- assert str(cue_span) == expected_str
-
-
-def test_webvtt_file():
- """Test WebVTT files."""
- with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
- content = f.read()
- vtt = _WebVTTFile.parse(content)
- assert len(vtt) == 13
- block = vtt.cue_blocks[11]
- assert str(block.timings) == "00:32.500 --> 00:33.500"
- assert len(block.payload) == 1
- cue_span = block.payload[0]
- assert isinstance(cue_span, _WebVTTCueVoiceSpan)
- assert cue_span.annotation == "Neil deGrasse Tyson"
- assert not cue_span.classes
- assert len(cue_span.components) == 1
- comp = cue_span.components[0]
- assert isinstance(comp, _WebVTTCueItalicSpan)
- assert len(comp.components) == 1
- comp2 = comp.components[0]
- assert isinstance(comp2, _WebVTTCueTextSpan)
- assert comp2.text == "Laughs"
-
- with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
- content = f.read()
- vtt = _WebVTTFile.parse(content)
- assert len(vtt) == 4
- reverse = (
- "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
- "https://www.w3.org/TR/webvtt1/\n\n"
- )
- reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
- assert content == reverse
-
- with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
- content = f.read()
- vtt = _WebVTTFile.parse(content)
- assert len(vtt) == 13
- for block in vtt:
- assert block.identifier
- block = vtt.cue_blocks[0]
- assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
- assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
- assert len(block.payload) == 1
- assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
- block = vtt.cue_blocks[2]
- assert isinstance(cue_span, _WebVTTCueVoiceSpan)
- assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
- assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
- assert len(block.payload) == 1
- assert isinstance(block.payload[0], _WebVTTCueTextSpan)
- assert block.payload[0].text == "Good."
-
-
def test_e2e_vtt_conversions():
directory = Path("./tests/data/webvtt/")
vtt_paths = sorted(directory.rglob("*.vtt"))
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 93f33e1fd..ad7eafa98 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -10,6 +10,8 @@
DoclingDocument,
FormulaItem,
PictureItem,
+ ProvenanceItem,
+ ProvenanceTrack,
TableItem,
TextItem,
)
@@ -237,7 +239,30 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
true_prov = true_item.prov[0]
pred_prov = pred_item.prov[0]
- assert true_prov.page_no == pred_prov.page_no, "Page provenance mistmatch"
+ assert type(pred_prov) is type(true_prov), "Provenance type mismatch"
+ if isinstance(pred_prov, ProvenanceItem):
+ assert true_prov.page_no == pred_prov.page_no, (
+ "Page provenance mistmatch"
+ )
+ elif isinstance(pred_prov, ProvenanceTrack):
+ assert true_prov.start_time._seconds == pred_prov.start_time._seconds, (
+ "ProvenanceTrack start time mismatch"
+ )
+ assert true_prov.end_time._seconds == pred_prov.end_time._seconds, (
+ "ProvenanceTrack end time mismatch"
+ )
+ assert true_prov.languages == pred_prov.languages, (
+ "ProvenanceTrack languages mismatch"
+ )
+ assert true_prov.classes == pred_prov.classes, (
+ "ProvenanceTrack classes mismatch"
+ )
+ assert true_prov.identifier == pred_prov.identifier, (
+ "ProvenanceTrack identifier mismatch"
+ )
+ assert true_prov.voice == pred_prov.voice, (
+ "ProvenanceTrack voice mismatch"
+ )
# TODO: add bbox check with tolerance
diff --git a/uv.lock b/uv.lock
index 6548b79f4..cd0663a3d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1546,7 +1546,7 @@ requires-dist = [
{ name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
{ name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
{ name = "certifi", specifier = ">=2024.7.4" },
- { name = "docling-core", extras = ["chunking"], specifier = ">=2.50.1,<3.0.0" },
+ { name = "docling-core", extras = ["chunking"], git = "ssh://git@github.com/docling-project/docling-core.git?rev=c75516516358f25add2682674fc7dc6eef2c5164" },
{ name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
{ name = "docling-parse", specifier = ">=4.7.0,<5.0.0" },
{ name = "easyocr", marker = "extra == 'easyocr'", specifier = ">=1.7,<2.0" },
@@ -1631,8 +1631,8 @@ examples = [
[[package]]
name = "docling-core"
-version = "2.51.1"
-source = { registry = "https://pypi.org/simple" }
+version = "2.55.0"
+source = { git = "ssh://git@github.com/docling-project/docling-core.git?rev=c75516516358f25add2682674fc7dc6eef2c5164#c75516516358f25add2682674fc7dc6eef2c5164" }
dependencies = [
{ name = "jsonref" },
{ name = "jsonschema" },
@@ -1645,10 +1645,6 @@ dependencies = [
{ name = "typer" },
{ name = "typing-extensions" },
]
-sdist = { url = "https://files.pythonhosted.org/packages/33/76/665a61f6208923fb312549d9c7a2ef5275bdd7fd4d83cbe8ddd668f2fa35/docling_core-2.51.1.tar.gz", hash = "sha256:f5b0d8ead535c8451f67f9545af007f5bebfda72744a8e90af6e83fb6a483a99", size = 184664, upload-time = "2025-11-14T13:33:48.586Z" }
-wheels = [
- { url = "https://files.pythonhosted.org/packages/a6/06/911a7374d59afff0dd8b50f84e1b7e5c4452886bbbe0e31e04510f44d43e/docling_core-2.51.1-py3-none-any.whl", hash = "sha256:76ca2b4c5c1d33475583671fe584b390e769152cac48d1fb24bf5a7457864a66", size = 186005, upload-time = "2025-11-14T13:33:46.695Z" },
-]
[package.optional-dependencies]
chunking = [
@@ -6119,6 +6115,9 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = "2025-07-27T13:03:27.495Z" },
{ url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" },
{ url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" },
+ { url = "https://files.pythonhosted.org/packages/f4/56/5337f27a8b8d2d6693f46f7b36bae47895e5820bfa259b0072574a4e1057/pybase64-1.4.2-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:0f331aa59549de21f690b6ccc79360ffed1155c3cfbc852eb5c097c0b8565a2b", size = 33888, upload-time = "2025-07-27T13:03:35.698Z" },
+ { url = "https://files.pythonhosted.org/packages/e3/ff/470768f0fe6de0aa302a8cb1bdf2f9f5cffc3f69e60466153be68bc953aa/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:69d3f0445b0faeef7bb7f93bf8c18d850785e2a77f12835f49e524cc54af04e7", size = 30914, upload-time = "2025-07-27T13:03:38.475Z" },
+ { url = "https://files.pythonhosted.org/packages/75/6b/d328736662665e0892409dc410353ebef175b1be5eb6bab1dad579efa6df/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2372b257b1f4dd512f317fb27e77d313afd137334de64c87de8374027aacd88a", size = 31380, upload-time = "2025-07-27T13:03:39.7Z" },
{ url = "https://files.pythonhosted.org/packages/ca/96/7ff718f87c67f4147c181b73d0928897cefa17dc75d7abc6e37730d5908f/pybase64-1.4.2-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fb794502b4b1ec91c4ca5d283ae71aef65e3de7721057bd9e2b3ec79f7a62d7d", size = 38230, upload-time = "2025-07-27T13:03:41.637Z" },
{ url = "https://files.pythonhosted.org/packages/71/ab/db4dbdfccb9ca874d6ce34a0784761471885d96730de85cee3d300381529/pybase64-1.4.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d377d48acf53abf4b926c2a7a24a19deb092f366a04ffd856bf4b3aa330b025d", size = 71608, upload-time = "2025-07-27T13:03:47.01Z" },
{ url = "https://files.pythonhosted.org/packages/f2/58/7f2cef1ceccc682088958448d56727369de83fa6b29148478f4d2acd107a/pybase64-1.4.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:ab9cdb6a8176a5cb967f53e6ad60e40c83caaa1ae31c5e1b29e5c8f507f17538", size = 56413, upload-time = "2025-07-27T13:03:49.908Z" },
@@ -6140,6 +6139,8 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/95/f0/c392c4ac8ccb7a34b28377c21faa2395313e3c676d76c382642e19a20703/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad59362fc267bf15498a318c9e076686e4beeb0dfe09b457fabbc2b32468b97a", size = 58103, upload-time = "2025-07-27T13:04:29.996Z" },
{ url = "https://files.pythonhosted.org/packages/32/30/00ab21316e7df8f526aa3e3dc06f74de6711d51c65b020575d0105a025b2/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:01593bd064e7dcd6c86d04e94e44acfe364049500c20ac68ca1e708fbb2ca970", size = 60779, upload-time = "2025-07-27T13:04:31.549Z" },
{ url = "https://files.pythonhosted.org/packages/a6/65/114ca81839b1805ce4a2b7d58bc16e95634734a2059991f6382fc71caf3e/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5b81547ad8ea271c79fdf10da89a1e9313cb15edcba2a17adf8871735e9c02a0", size = 74684, upload-time = "2025-07-27T13:04:32.976Z" },
+ { url = "https://files.pythonhosted.org/packages/99/bf/00a87d951473ce96c8c08af22b6983e681bfabdb78dd2dcf7ee58eac0932/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:4157ad277a32cf4f02a975dffc62a3c67d73dfa4609b2c1978ef47e722b18b8e", size = 30924, upload-time = "2025-07-27T13:04:39.189Z" },
+ { url = "https://files.pythonhosted.org/packages/ae/43/dee58c9d60e60e6fb32dc6da722d84592e22f13c277297eb4ce6baf99a99/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:e113267dc349cf624eb4f4fbf53fd77835e1aa048ac6877399af426aab435757", size = 31390, upload-time = "2025-07-27T13:04:40.995Z" },
{ url = "https://files.pythonhosted.org/packages/e1/11/b28906fc2e330b8b1ab4bc845a7bef808b8506734e90ed79c6062b095112/pybase64-1.4.2-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:cea5aaf218fd9c5c23afacfe86fd4464dfedc1a0316dd3b5b4075b068cc67df0", size = 38212, upload-time = "2025-07-27T13:04:42.729Z" },
{ url = "https://files.pythonhosted.org/packages/e4/2e/851eb51284b97354ee5dfa1309624ab90920696e91a33cd85b13d20cc5c1/pybase64-1.4.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a3e54dcf0d0305ec88473c9d0009f698cabf86f88a8a10090efeff2879c421bb", size = 71674, upload-time = "2025-07-27T13:04:49.294Z" },
{ url = "https://files.pythonhosted.org/packages/a4/8e/3479266bc0e65f6cc48b3938d4a83bff045330649869d950a378f2ddece0/pybase64-1.4.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:753da25d4fd20be7bda2746f545935773beea12d5cb5ec56ec2d2960796477b1", size = 56461, upload-time = "2025-07-27T13:04:52.37Z" },