diff --git a/docling/backend/msexcel_backend.py b/docling/backend/msexcel_backend.py
index 2bcc34d76..c77919559 100644
--- a/docling/backend/msexcel_backend.py
+++ b/docling/backend/msexcel_backend.py
@@ -669,6 +669,8 @@ def _find_page_size(
             if not isinstance(item, DocItem):
                 continue
             for provenance in item.prov:
+                if not isinstance(provenance, ProvenanceItem):
+                    continue
                 bbox = provenance.bbox
                 left = min(left, bbox.l) if left != -1 else bbox.l
                 right = max(right, bbox.r) if right != -1 else bbox.r
diff --git a/docling/backend/webvtt_backend.py b/docling/backend/webvtt_backend.py
index 2a7d02ce7..e61cd0f34 100644
--- a/docling/backend/webvtt_backend.py
+++ b/docling/backend/webvtt_backend.py
@@ -1,8 +1,9 @@
+import copy
 import logging
-import re
+from dataclasses import dataclass, field
 from io import BytesIO
 from pathlib import Path
-from typing import Annotated, ClassVar, Literal, Optional, Union, cast
+from typing import Literal, Optional, Union
 
 from docling_core.types.doc import (
     ContentLayer,
@@ -10,12 +11,20 @@
     DoclingDocument,
     DocumentOrigin,
     Formatting,
-    GroupLabel,
-    NodeItem,
+    ProvenanceTrack,
 )
-from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
-from pydantic.types import StringConstraints
-from typing_extensions import Self, override
+from docling_core.types.doc.webvtt import (
+    WebVTTCueBoldSpan,
+    WebVTTCueComponent,
+    WebVTTCueComponentWithTerminator,
+    WebVTTCueItalicSpan,
+    WebVTTCueLanguageSpan,
+    WebVTTCueTextSpan,
+    WebVTTCueUnderlineSpan,
+    WebVTTCueVoiceSpan,
+    WebVTTFile,
+)
+from typing_extensions import override
 
 from docling.backend.abstract_backend import DeclarativeDocumentBackend
 from docling.datamodel.base_models import InputFormat
@@ -24,409 +33,29 @@
 _log = logging.getLogger(__name__)
 
 
-class _WebVTTTimestamp(BaseModel):
-    """Model representing a WebVTT timestamp.
-
-    A WebVTT timestamp is always interpreted relative to the current playback position
-    of the media data that the WebVTT file is to be synchronized with.
-    """
-
-    model_config = ConfigDict(regex_engine="python-re")
-
-    raw: Annotated[
-        str,
-        Field(
-            description="A representation of the WebVTT Timestamp as a single string"
-        ),
-    ]
-
-    _pattern: ClassVar[re.Pattern] = re.compile(
-        r"^(?:(\d{2,}):)?([0-5]\d):([0-5]\d)\.(\d{3})$"
-    )
-    _hours: int
-    _minutes: int
-    _seconds: int
-    _millis: int
-
-    @model_validator(mode="after")
-    def validate_raw(self) -> Self:
-        m = self._pattern.match(self.raw)
-        if not m:
-            raise ValueError(f"Invalid WebVTT timestamp format: {self.raw}")
-        self._hours = int(m.group(1)) if m.group(1) else 0
-        self._minutes = int(m.group(2))
-        self._seconds = int(m.group(3))
-        self._millis = int(m.group(4))
-
-        if self._minutes < 0 or self._minutes > 59:
-            raise ValueError("Minutes must be between 0 and 59")
-        if self._seconds < 0 or self._seconds > 59:
-            raise ValueError("Seconds must be between 0 and 59")
-
-        return self
-
-    @property
-    def seconds(self) -> float:
-        """A representation of the WebVTT Timestamp in seconds"""
-        return (
-            self._hours * 3600
-            + self._minutes * 60
-            + self._seconds
-            + self._millis / 1000.0
-        )
-
-    @override
-    def __str__(self) -> str:
-        return self.raw
-
-
-_WebVTTCueIdentifier = Annotated[
-    str, StringConstraints(strict=True, pattern=r"^(?!.*-->)[^\n\r]+$")
-]
-
-
-class _WebVTTCueTimings(BaseModel):
-    """Model representating WebVTT cue timings."""
-
-    start: Annotated[
-        _WebVTTTimestamp, Field(description="Start time offset of the cue")
-    ]
-    end: Annotated[_WebVTTTimestamp, Field(description="End time offset of the cue")]
-
-    @model_validator(mode="after")
-    def check_order(self) -> Self:
-        if self.start and self.end:
-            if self.end.seconds <= self.start.seconds:
-                raise ValueError("End timestamp must be greater than start timestamp")
-        return self
-
-    @override
-    def __str__(self):
-        return f"{self.start} --> {self.end}"
-
-
-class _WebVTTCueTextSpan(BaseModel):
-    """Model representing a WebVTT cue text span."""
-
+@dataclass
+class AnnotatedText:
     text: str
-    span_type: Literal["text"] = "text"
-
-    @field_validator("text", mode="after")
-    @classmethod
-    def validate_text(cls, value: str) -> str:
-        if any(ch in value for ch in {"\n", "\r", "&", "<"}):
-            raise ValueError("Cue text span contains invalid characters")
-        if len(value) == 0:
-            raise ValueError("Cue text span cannot be empty")
-        return value
-
-    @override
-    def __str__(self):
-        return self.text
-
-
-class _WebVTTCueVoiceSpan(BaseModel):
-    """Model representing a WebVTT cue voice span."""
-
-    annotation: Annotated[
-        str,
-        Field(
-            description=(
-                "Cue span start tag annotation text representing the name of thevoice"
-            )
-        ),
-    ]
-    classes: Annotated[
-        list[str],
-        Field(description="List of classes representing the cue span's significance"),
-    ] = []
-    components: Annotated[
-        list["_WebVTTCueComponent"],
-        Field(description="The components representing the cue internal text"),
-    ] = []
-    span_type: Literal["v"] = "v"
-
-    @field_validator("annotation", mode="after")
-    @classmethod
-    def validate_annotation(cls, value: str) -> str:
-        if any(ch in value for ch in {"\n", "\r", "&", ">"}):
-            raise ValueError(
-                "Cue span start tag annotation contains invalid characters"
-            )
-        if not value:
-            raise ValueError("Cue text span cannot be empty")
-        return value
-
-    @field_validator("classes", mode="after")
-    @classmethod
-    def validate_classes(cls, value: list[str]) -> list[str]:
-        for item in value:
-            if any(ch in item for ch in {"\t", "\n", "\r", " ", "&", "<", ">", "."}):
-                raise ValueError(
-                    "A cue span start tag class contains invalid characters"
-                )
-            if not item:
-                raise ValueError("Cue span start tag classes cannot be empty")
-        return value
-
-    @override
-    def __str__(self):
-        tag = f"v.{'.'.join(self.classes)}" if self.classes else "v"
-        inner = "".join(str(span) for span in self.components)
-        return f"<{tag} {self.annotation}>{inner}</v>"
-
-
-class _WebVTTCueClassSpan(BaseModel):
-    span_type: Literal["c"] = "c"
-    components: list["_WebVTTCueComponent"]
-
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<c>{inner}</c>"
-
-
-class _WebVTTCueItalicSpan(BaseModel):
-    span_type: Literal["i"] = "i"
-    components: list["_WebVTTCueComponent"]
-
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<i>{inner}</i>"
-
-
-class _WebVTTCueBoldSpan(BaseModel):
-    span_type: Literal["b"] = "b"
-    components: list["_WebVTTCueComponent"]
-
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<b>{inner}</b>"
-
-
-class _WebVTTCueUnderlineSpan(BaseModel):
-    span_type: Literal["u"] = "u"
-    components: list["_WebVTTCueComponent"]
-
-    @override
-    def __str__(self):
-        inner = "".join(str(span) for span in self.components)
-        return f"<u>{inner}</u>"
-
-
-_WebVTTCueComponent = Annotated[
-    Union[
-        _WebVTTCueTextSpan,
-        _WebVTTCueClassSpan,
-        _WebVTTCueItalicSpan,
-        _WebVTTCueBoldSpan,
-        _WebVTTCueUnderlineSpan,
-        _WebVTTCueVoiceSpan,
-    ],
-    Field(discriminator="span_type", description="The WebVTT cue component"),
-]
-
-
-class _WebVTTCueBlock(BaseModel):
-    """Model representing a WebVTT cue block.
-
-    The optional WebVTT cue settings list is not supported.
-    The cue payload is limited to the following spans: text, class, italic, bold,
-    underline, and voice.
-    """
-
-    model_config = ConfigDict(regex_engine="python-re")
-
-    identifier: Optional[_WebVTTCueIdentifier] = Field(
-        None, description="The WebVTT cue identifier"
+    voice: Optional[str] = None
+    formatting: Optional[Formatting] = None
+    classes: dict[Literal["b", "u", "i", "lang", "v"], list[str]] = field(
+        default_factory=dict
     )
-    timings: Annotated[_WebVTTCueTimings, Field(description="The WebVTT cue timings")]
-    payload: Annotated[list[_WebVTTCueComponent], Field(description="The cue payload")]
-
-    _pattern_block: ClassVar[re.Pattern] = re.compile(
-        r"<(/?)(i|b|c|u|v(?:\.[^\t\n\r &<>.]+)*)(?:\s+([^>]*))?>"
-    )
-    _pattern_voice_tag: ClassVar[re.Pattern] = re.compile(
-        r"^<v(?P<class>\.[^\t\n\r &<>]+)?"  # zero or more classes
-        r"[ \t]+(?P<annotation>[^\n\r&>]+)>"  # required space and annotation
-    )
-
-    @field_validator("payload", mode="after")
-    @classmethod
-    def validate_payload(cls, payload):
-        for voice in payload:
-            if "-->" in str(voice):
-                raise ValueError("Cue payload must not contain '-->'")
-        return payload
-
-    @classmethod
-    def parse(cls, raw: str) -> "_WebVTTCueBlock":
-        lines = raw.strip().splitlines()
-        if not lines:
-            raise ValueError("Cue block must have at least one line")
-        identifier: Optional[_WebVTTCueIdentifier] = None
-        timing_line = lines[0]
-        if "-->" not in timing_line and len(lines) > 1:
-            identifier = timing_line
-            timing_line = lines[1]
-            cue_lines = lines[2:]
-        else:
-            cue_lines = lines[1:]
-
-        if "-->" not in timing_line:
-            raise ValueError("Cue block must contain WebVTT cue timings")
-
-        start, end = [t.strip() for t in timing_line.split("-->")]
-        end = re.split(" |\t", end)[0]  # ignore the cue settings list
-        timings: _WebVTTCueTimings = _WebVTTCueTimings(
-            start=_WebVTTTimestamp(raw=start), end=_WebVTTTimestamp(raw=end)
+    lang: set[str] = field(default_factory=set)
+
+    def copy_meta(self, text: str) -> "AnnotatedText":
+        return AnnotatedText(
+            text=text,
+            voice=self.voice,
+            formatting=self.formatting.model_copy() if self.formatting else None,
+            classes=copy.deepcopy(self.classes),  # nested lists must not be shared
+            lang=self.lang.copy(),  # each copy gets its own set
         )
-        cue_text = " ".join(cue_lines).strip()
-        if cue_text.startswith("<v") and "</v>" not in cue_text:
-            # adding close tag for cue voice spans without end tag
-            cue_text += "</v>"
-
-        stack: list[list[_WebVTTCueComponent]] = [[]]
-        tag_stack: list[Union[str, tuple]] = []
-
-        pos = 0
-        matches = list(cls._pattern_block.finditer(cue_text))
-        i = 0
-        while i < len(matches):
-            match = matches[i]
-            if match.start() > pos:
-                stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos : match.start()]))
-            tag = match.group(0)
-
-            if tag.startswith(("<i>", "<b>", "<u>", "<c>")):
-                tag_type = tag[1:2]
-                tag_stack.append(tag_type)
-                stack.append([])
-            elif tag == "</i>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueItalicSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</b>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueBoldSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</u>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueUnderlineSpan(components=children))
-                tag_stack.pop()
-            elif tag == "</c>":
-                children = stack.pop()
-                stack[-1].append(_WebVTTCueClassSpan(components=children))
-                tag_stack.pop()
-            elif tag.startswith("<v"):
-                tag_stack.append(("v", tag))
-                stack.append([])
-            elif tag.startswith("</v"):
-                children = stack.pop() if stack else []
-                if (
-                    tag_stack
-                    and isinstance(tag_stack[-1], tuple)
-                    and tag_stack[-1][0] == "v"
-                ):
-                    _, voice = cast(tuple, tag_stack.pop())
-                    voice_match = cls._pattern_voice_tag.match(voice)
-                    if voice_match:
-                        class_string = voice_match.group("class")
-                        annotation = voice_match.group("annotation")
-                        if annotation:
-                            classes: list[str] = []
-                            if class_string:
-                                classes = [c for c in class_string.split(".") if c]
-                            stack[-1].append(
-                                _WebVTTCueVoiceSpan(
-                                    annotation=annotation.strip(),
-                                    classes=classes,
-                                    components=children,
-                                )
-                            )
-
-            pos = match.end()
-            i += 1
-
-        if pos < len(cue_text):
-            stack[-1].append(_WebVTTCueTextSpan(text=cue_text[pos:]))
-
-        return cls(
-            identifier=identifier,
-            timings=timings,
-            payload=stack[0],
-        )
-
-    def __str__(self):
-        parts = []
-        if self.identifier:
-            parts.append(f"{self.identifier}\n")
-        timings_line = str(self.timings)
-        parts.append(timings_line + "\n")
-        for idx, span in enumerate(self.payload):
-            if idx == 0 and len(self.payload) == 1 and span.span_type == "v":
-                # the end tag may be omitted for brevity
-                parts.append(str(span).removesuffix("</v>"))
-            else:
-                parts.append(str(span))
-
-        return "".join(parts)
-
-
-class _WebVTTFile(BaseModel):
-    """A model representing a WebVTT file."""
-
-    cue_blocks: list[_WebVTTCueBlock]
-
-    @staticmethod
-    def verify_signature(content: str) -> bool:
-        if not content:
-            return False
-        elif len(content) == 6:
-            return content == "WEBVTT"
-        elif len(content) > 6 and content.startswith("WEBVTT"):
-            return content[6] in (" ", "\t", "\n")
-        else:
-            return False
-
-    @classmethod
-    def parse(cls, raw: str) -> "_WebVTTFile":
-        # Normalize newlines to LF
-        raw = raw.replace("\r\n", "\n").replace("\r", "\n")
-
-        # Check WebVTT signature
-        if not cls.verify_signature(raw):
-            raise ValueError("Invalid WebVTT file signature")
-
-        # Strip "WEBVTT" header line
-        lines = raw.split("\n", 1)
-        body = lines[1] if len(lines) > 1 else ""
-
-        # Remove NOTE/STYLE/REGION blocks
-        body = re.sub(r"^(NOTE[^\n]*\n(?:.+\n)*?)\n", "", body, flags=re.MULTILINE)
-        body = re.sub(r"^(STYLE|REGION)(?:.+\n)*?\n", "", body, flags=re.MULTILINE)
-
-        # Split into cue blocks
-        raw_blocks = re.split(r"\n\s*\n", body.strip())
-        cues: list[_WebVTTCueBlock] = []
-        for block in raw_blocks:
-            try:
-                cues.append(_WebVTTCueBlock.parse(block))
-            except ValueError as e:
-                _log.warning(f"Failed to parse cue block:\n{block}\n{e}")
-
-        return cls(cue_blocks=cues)
-
-    def __iter__(self):
-        return iter(self.cue_blocks)
 
-    def __getitem__(self, idx):
-        return self.cue_blocks[idx]
 
-    def __len__(self):
-        return len(self.cue_blocks)
+@dataclass
+class AnnotatedPar:
+    items: list[AnnotatedText]  # text runs of one paragraph, in payload order
 
 
 class WebVTTDocumentBackend(DeclarativeDocumentBackend):
@@ -458,7 +87,7 @@ def __init__(self, in_doc: InputDocument, path_or_stream: Union[BytesIO, Path]):
 
     @override
     def is_valid(self) -> bool:
-        return _WebVTTFile.verify_signature(self.content)
+        return WebVTTFile.verify_signature(self.content)
 
     @classmethod
     @override
@@ -477,36 +106,18 @@ def supported_formats(cls) -> set[InputFormat]:
         return {InputFormat.VTT}
 
     @staticmethod
-    def _add_text_from_component(
-        doc: DoclingDocument, item: _WebVTTCueComponent, parent: Optional[NodeItem]
+    def _add_classes(
+        item: AnnotatedText,
+        key: Literal["b", "u", "i", "lang", "v"],
+        classes: list[str],
     ) -> None:
-        """Adds a TextItem to a document by extracting text from a cue span component.
-
-        TODO: address nesting
-        """
-        formatting = Formatting()
-        text = ""
-        if isinstance(item, _WebVTTCueItalicSpan):
-            formatting.italic = True
-        elif isinstance(item, _WebVTTCueBoldSpan):
-            formatting.bold = True
-        elif isinstance(item, _WebVTTCueUnderlineSpan):
-            formatting.underline = True
-        if isinstance(item, _WebVTTCueTextSpan):
-            text = item.text
-        else:
-            # TODO: address nesting
-            text = "".join(
-                [t.text for t in item.components if isinstance(t, _WebVTTCueTextSpan)]
-            )
-        if text := text.strip():
-            doc.add_text(
-                label=DocItemLabel.TEXT,
-                text=text,
-                parent=parent,
-                content_layer=ContentLayer.BODY,
-                formatting=formatting,
-            )
+        if not classes:  # nothing to record; do not create an empty bucket
+            return
+
+        bucket = item.classes.setdefault(key, [])  # per-key list, made on first use
+        for cls in classes:
+            if cls not in bucket:  # keep first-seen order, skip duplicates
+                bucket.append(cls)
 
     @override
     def convert(self) -> DoclingDocument:
@@ -521,52 +132,115 @@ def convert(self) -> DoclingDocument:
         )
         doc = DoclingDocument(name=self.file.stem or "file", origin=origin)
 
-        vtt: _WebVTTFile = _WebVTTFile.parse(self.content)
-        for block in vtt.cue_blocks:
-            block_group = doc.add_group(
-                label=GroupLabel.SECTION,
-                name="WebVTT cue block",
-                parent=None,
-                content_layer=ContentLayer.BODY,
-            )
-            if block.identifier:
-                doc.add_text(
-                    label=DocItemLabel.TEXT,
-                    text=str(block.identifier),
-                    parent=block_group,
-                    content_layer=ContentLayer.BODY,
+        vtt: WebVTTFile = WebVTTFile.parse(self.content)
+        cue_text: list[AnnotatedPar] = []
+        parents: list[AnnotatedText] = []
+
+        def _extract_components(
+            payload: list[WebVTTCueComponentWithTerminator],
+        ) -> None:
+            """Flatten `payload` into `cue_text`; a terminator starts a paragraph."""
+            if not cue_text:
+                cue_text.append(AnnotatedPar(items=[]))
+            # Always append to cue_text[-1]: a nested span may open a new paragraph.
+            for comp in payload:
+                item: AnnotatedText = (
+                    parents[-1].copy_meta("") if parents else AnnotatedText(text="")
                 )
+                component: WebVTTCueComponent = comp.component
+                if isinstance(component, WebVTTCueTextSpan):
+                    item.text = component.text
+                    cue_text[-1].items.append(item)
+                else:
+                    # configure metadata based on span type
+                    if isinstance(component, WebVTTCueBoldSpan):
+                        item.formatting = item.formatting or Formatting()
+                        item.formatting.bold = True
+                        self._add_classes(item, "b", component.start_tag.classes)
+
+                    elif isinstance(component, WebVTTCueItalicSpan):
+                        item.formatting = item.formatting or Formatting()
+                        item.formatting.italic = True
+                        self._add_classes(item, "i", component.start_tag.classes)
+
+                    elif isinstance(component, WebVTTCueUnderlineSpan):
+                        item.formatting = item.formatting or Formatting()
+                        item.formatting.underline = True
+                        self._add_classes(item, "u", component.start_tag.classes)
+
+                    elif isinstance(component, WebVTTCueLanguageSpan):
+                        item.lang.add(component.start_tag.annotation)
+                        self._add_classes(item, "lang", component.start_tag.classes)
+
+                    elif isinstance(component, WebVTTCueVoiceSpan):
+                        # voice spans cannot be embedded
+                        item.voice = component.start_tag.annotation
+                        self._add_classes(item, "v", component.start_tag.classes)
+
+                    parents.append(item)
+                    _extract_components(component.internal_text.components)
+                    parents.pop()
+
+                if comp.terminator is not None:
+                    # line terminator: following text belongs to a new paragraph
+                    cue_text.append(AnnotatedPar(items=[]))
+
+        def _add_text_item(
+            text: str,
+            formatting: Optional[Formatting],
+            item: AnnotatedText,
+            parent=None,
+        ) -> None:
+            """Add one text item whose provenance is the current cue's track."""
+            # Sort languages: set iteration order is not stable across runs.
+            languages = sorted(item.lang) if item.lang else None
+            classes = (
+                [".".join([k, *v]) for k, v in item.classes.items()]
+                if item.classes
+                else None
+            )
+            track = ProvenanceTrack(  # block/identifier: enclosing cue loop
+                start_time=block.timings.start.seconds,
+                end_time=block.timings.end.seconds,
+                identifier=identifier,
+                languages=languages,
+                classes=classes,
+                voice=item.voice or None,
+            )
             doc.add_text(
                 label=DocItemLabel.TEXT,
-                text=str(block.timings),
-                parent=block_group,
+                text=text,
                 content_layer=ContentLayer.BODY,
+                prov=track,
+                formatting=formatting,
+                parent=parent,
             )
-            for cue_span in block.payload:
-                if isinstance(cue_span, _WebVTTCueVoiceSpan):
-                    voice_group = doc.add_group(
-                        label=GroupLabel.INLINE,
-                        name="WebVTT cue voice span",
-                        parent=block_group,
-                        content_layer=ContentLayer.BODY,
-                    )
-                    voice = cue_span.annotation
-                    if classes := cue_span.classes:
-                        voice += f" ({', '.join(classes)})"
-                    voice += ": "
-                    doc.add_text(
-                        label=DocItemLabel.TEXT,
-                        text=voice,
-                        parent=voice_group,
-                        content_layer=ContentLayer.BODY,
+
+        for block in vtt.cue_blocks:
+            cue_text = []
+            parents = []
+            identifier = str(block.identifier) if block.identifier else None
+            _extract_components(block.payload)
+            for par in cue_text:
+                if not par.items:
+                    continue
+                if len(par.items) == 1:
+                    item = par.items[0]
+                    _add_text_item(
+                        text=item.text,
+                        formatting=item.formatting,
+                        item=item,
                     )
-                    for item in cue_span.components:
-                        WebVTTDocumentBackend._add_text_from_component(
-                            doc, item, voice_group
-                        )
                 else:
-                    WebVTTDocumentBackend._add_text_from_component(
-                        doc, cue_span, block_group
+                    group = doc.add_inline_group(
+                        "WebVTT cue span", content_layer=ContentLayer.BODY
                     )
+                    for item in par.items:
+                        _add_text_item(
+                            text=item.text,
+                            formatting=item.formatting,
+                            item=item,
+                            parent=group,
+                        )
 
         return doc
diff --git a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
index db73db8db..70434fd8d 100644
--- a/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
+++ b/docling/experimental/pipeline/threaded_layout_vlm_pipeline.py
@@ -12,8 +12,7 @@
 from pathlib import Path
 from typing import TYPE_CHECKING, List, Optional, Union, cast
 
-from docling_core.types.doc import DoclingDocument
-from docling_core.types.doc.document import DocTagsDocument
+from docling_core.types.doc import DoclingDocument, DocTagsDocument, ProvenanceItem
 from PIL import Image as PILImage
 
 if TYPE_CHECKING:
@@ -371,13 +370,17 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
                 page_map = {p.page_no: p for p in conv_res.pages}
                 scale = self.pipeline_options.images_scale
                 for element, _level in conv_res.document.iterate_items():
-                    if not isinstance(element, DocItem) or len(element.prov) == 0:
+                    if (
+                        not isinstance(element, DocItem)
+                        or not element.prov
+                        or not isinstance(prov := element.prov[0], ProvenanceItem)
+                    ):
                         continue
                     if (
                         isinstance(element, PictureItem)
                         and self.pipeline_options.generate_picture_images
                     ):
-                        page_no = element.prov[0].page_no
+                        page_no = prov.page_no
                         page = page_map.get(page_no)
                         if page is None:
                             _log.warning(
@@ -387,10 +390,8 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
                         assert page.size is not None
                         assert page.image is not None
 
-                        crop_bbox = (
-                            element.prov[0]
-                            .bbox.scaled(scale=scale)
-                            .to_top_left_origin(page_height=page.size.height * scale)
+                        crop_bbox = prov.bbox.scaled(scale=scale).to_top_left_origin(
+                            page_height=page.size.height * scale
                         )
 
                         cropped_im = page.image.crop(crop_bbox.as_tuple())
diff --git a/docling/models/base_model.py b/docling/models/base_model.py
index c69b5018b..dae4ee92d 100644
--- a/docling/models/base_model.py
+++ b/docling/models/base_model.py
@@ -10,6 +10,7 @@
     DoclingDocument,
     NodeItem,
     PictureItem,
+    ProvenanceItem,
 )
 from PIL.Image import Image
 from typing_extensions import TypeVar
@@ -199,6 +200,8 @@ def prepare_element(
                 return None
 
         # Crop the image form the page
+        if not isinstance(element.prov[0], ProvenanceItem):
+            return None
         element_prov = element.prov[0]
         bbox = element_prov.bbox
         width = bbox.r - bbox.l
diff --git a/docling/models/picture_description_base_model.py b/docling/models/picture_description_base_model.py
index 055c74b1f..3643bd9ff 100644
--- a/docling/models/picture_description_base_model.py
+++ b/docling/models/picture_description_base_model.py
@@ -7,6 +7,7 @@
     DoclingDocument,
     NodeItem,
     PictureItem,
+    ProvenanceItem,
 )
 from docling_core.types.doc.document import (  # TODO: move import to docling_core.types.doc
     PictureDescriptionData,
@@ -64,8 +65,8 @@ def __call__(
             assert isinstance(el.item, PictureItem)
             describe_image = True
             # Don't describe the image if it's smaller than the threshold
-            if len(el.item.prov) > 0:
-                prov = el.item.prov[0]  # PictureItems have at most a single provenance
+            if el.item.prov and isinstance(prov := el.item.prov[0], ProvenanceItem):
+                # PictureItems have at most a single provenance
                 page = doc.pages.get(prov.page_no)
                 if page is not None:
                     page_area = page.size.width * page.size.height
diff --git a/docling/pipeline/asr_pipeline.py b/docling/pipeline/asr_pipeline.py
index 2bb94e42a..8b2f47092 100644
--- a/docling/pipeline/asr_pipeline.py
+++ b/docling/pipeline/asr_pipeline.py
@@ -1,47 +1,35 @@
 import logging
-import os
-import re
 import sys
 import tempfile
 from io import BytesIO
 from pathlib import Path
-from typing import TYPE_CHECKING, List, Optional, Union, cast
-
-from docling_core.types.doc import DoclingDocument, DocumentOrigin
-
-# import whisper  # type: ignore
-# import librosa
-# import numpy as np
-# import soundfile as sf  # type: ignore
-from docling_core.types.doc.labels import DocItemLabel
-from pydantic import BaseModel, Field, validator
+from typing import Optional, Union
+
+from docling_core.types.doc import (
+    ContentLayer,
+    DocItemLabel,
+    DoclingDocument,
+    DocumentOrigin,
+    ProvenanceTrack,
+)
+from pydantic import BaseModel, Field
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.noop_backend import NoOpBackend
-
-# from pydub import AudioSegment  # type: ignore
-# from transformers import WhisperForConditionalGeneration, WhisperProcessor, pipeline
 from docling.datamodel.accelerator_options import (
     AcceleratorOptions,
 )
 from docling.datamodel.base_models import (
     ConversionStatus,
-    FormatToMimeType,
 )
-from docling.datamodel.document import ConversionResult, InputDocument
+from docling.datamodel.document import ConversionResult
 from docling.datamodel.pipeline_options import (
     AsrPipelineOptions,
 )
 from docling.datamodel.pipeline_options_asr_model import (
     InlineAsrMlxWhisperOptions,
     InlineAsrNativeWhisperOptions,
-    # AsrResponseFormat,
-    InlineAsrOptions,
 )
-from docling.datamodel.pipeline_options_vlm_model import (
-    InferenceFramework,
-)
-from docling.datamodel.settings import settings
 from docling.pipeline.base_pipeline import BasePipeline
 from docling.utils.accelerator_utils import decide_device
 from docling.utils.profiling import ProfilingScope, TimeRecorder
@@ -190,8 +178,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
             )
 
             for citem in conversation:
+                prov: ProvenanceTrack = ProvenanceTrack(
+                    start_time=citem.start_time,
+                    end_time=citem.end_time,
+                    voice=citem.speaker,
+                )
                 conv_res.document.add_text(
-                    label=DocItemLabel.TEXT, text=citem.to_string()
+                    label=DocItemLabel.TEXT,
+                    text=citem.text,
+                    prov=prov,
+                    content_layer=ContentLayer.BODY,
                 )
 
             return conv_res
@@ -299,8 +295,16 @@ def run(self, conv_res: ConversionResult) -> ConversionResult:
             )
 
             for citem in conversation:
+                prov: ProvenanceTrack = ProvenanceTrack(
+                    start_time=citem.start_time,
+                    end_time=citem.end_time,
+                    voice=citem.speaker,
+                )
                 conv_res.document.add_text(
-                    label=DocItemLabel.TEXT, text=citem.to_string()
+                    label=DocItemLabel.TEXT,
+                    text=citem.text,
+                    prov=prov,
+                    content_layer=ContentLayer.BODY,
                 )
 
             conv_res.status = ConversionStatus.SUCCESS
diff --git a/docling/pipeline/legacy_standard_pdf_pipeline.py b/docling/pipeline/legacy_standard_pdf_pipeline.py
index 55c2703cd..ceca82db9 100644
--- a/docling/pipeline/legacy_standard_pdf_pipeline.py
+++ b/docling/pipeline/legacy_standard_pdf_pipeline.py
@@ -4,7 +4,13 @@
 from typing import Optional, cast
 
 import numpy as np
-from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+from docling_core.types.doc import (
+    DocItem,
+    ImageRef,
+    PictureItem,
+    ProvenanceItem,
+    TableItem,
+)
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
@@ -181,7 +187,11 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
                 ):
                     scale = self.pipeline_options.images_scale
                     for element, _level in conv_res.document.iterate_items():
-                        if not isinstance(element, DocItem) or len(element.prov) == 0:
+                        if (
+                            not isinstance(element, DocItem)
+                            or not element.prov
+                            or not isinstance(prov := element.prov[0], ProvenanceItem)
+                        ):
                             continue
                         if (
                             isinstance(element, PictureItem)
@@ -190,7 +200,7 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
                             isinstance(element, TableItem)
                             and self.pipeline_options.generate_table_images
                         ):
-                            page_ix = element.prov[0].page_no - 1
+                            page_ix = prov.page_no - 1
                             page = next(
                                 (p for p in conv_res.pages if p.page_no == page_ix),
                                 cast("Page", None),
@@ -199,13 +209,9 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
                             assert page.size is not None
                             assert page.image is not None
 
-                            crop_bbox = (
-                                element.prov[0]
-                                .bbox.scaled(scale=scale)
-                                .to_top_left_origin(
-                                    page_height=page.size.height * scale
-                                )
-                            )
+                            crop_bbox = prov.bbox.scaled(
+                                scale=scale
+                            ).to_top_left_origin(page_height=page.size.height * scale)
 
                             cropped_im = page.image.crop(crop_bbox.as_tuple())
                             element.image = ImageRef.from_pil(
diff --git a/docling/pipeline/standard_pdf_pipeline.py b/docling/pipeline/standard_pdf_pipeline.py
index 585c548c6..54def080e 100644
--- a/docling/pipeline/standard_pdf_pipeline.py
+++ b/docling/pipeline/standard_pdf_pipeline.py
@@ -26,7 +26,13 @@
 from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, cast
 
 import numpy as np
-from docling_core.types.doc import DocItem, ImageRef, PictureItem, TableItem
+from docling_core.types.doc import (
+    DocItem,
+    ImageRef,
+    PictureItem,
+    ProvenanceItem,
+    TableItem,
+)
 
 from docling.backend.abstract_backend import AbstractDocumentBackend
 from docling.backend.pdf_backend import PdfDocumentBackend
@@ -760,7 +766,11 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
                 ):
                     scale = self.pipeline_options.images_scale
                     for element, _level in conv_res.document.iterate_items():
-                        if not isinstance(element, DocItem) or len(element.prov) == 0:
+                        if (
+                            not isinstance(element, DocItem)
+                            or not element.prov
+                            or not isinstance(prov := element.prov[0], ProvenanceItem)
+                        ):
                             continue
                         if (
                             isinstance(element, PictureItem)
@@ -769,7 +779,7 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
                             isinstance(element, TableItem)
                             and self.pipeline_options.generate_table_images
                         ):
-                            page_ix = element.prov[0].page_no - 1
+                            page_ix = prov.page_no - 1
                             page = next(
                                 (p for p in conv_res.pages if p.page_no == page_ix),
                                 cast("Page", None),
@@ -778,13 +788,9 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
                             assert page.size is not None
                             assert page.image is not None
 
-                            crop_bbox = (
-                                element.prov[0]
-                                .bbox.scaled(scale=scale)
-                                .to_top_left_origin(
-                                    page_height=page.size.height * scale
-                                )
-                            )
+                            crop_bbox = prov.bbox.scaled(
+                                scale=scale
+                            ).to_top_left_origin(page_height=page.size.height * scale)
 
                             cropped_im = page.image.crop(crop_bbox.as_tuple())
                             element.image = ImageRef.from_pil(
diff --git a/docling/pipeline/vlm_pipeline.py b/docling/pipeline/vlm_pipeline.py
index ab919c4d9..73831fc49 100644
--- a/docling/pipeline/vlm_pipeline.py
+++ b/docling/pipeline/vlm_pipeline.py
@@ -165,21 +165,23 @@ def _assemble_document(self, conv_res: ConversionResult) -> ConversionResult:
             if self.pipeline_options.generate_picture_images:
                 scale = self.pipeline_options.images_scale
                 for element, _level in conv_res.document.iterate_items():
-                    if not isinstance(element, DocItem) or len(element.prov) == 0:
+                    if (
+                        not isinstance(element, DocItem)
+                        or not element.prov
+                        or not isinstance(prov := element.prov[0], ProvenanceItem)
+                    ):
                         continue
                     if (
                         isinstance(element, PictureItem)
                         and self.pipeline_options.generate_picture_images
                     ):
-                        page_ix = element.prov[0].page_no - 1
+                        page_ix = prov.page_no - 1
                         page = conv_res.pages[page_ix]
                         assert page.size is not None
                         assert page.image is not None
 
-                        crop_bbox = (
-                            element.prov[0]
-                            .bbox.scaled(scale=scale)
-                            .to_top_left_origin(page_height=page.size.height * scale)
+                        crop_bbox = prov.bbox.scaled(scale=scale).to_top_left_origin(
+                            page_height=page.size.height * scale
                         )
 
                         cropped_im = page.image.crop(crop_bbox.as_tuple())
@@ -216,12 +218,14 @@ def _turn_dt_into_doc(self, conv_res) -> DoclingDocument:
             if self.force_backend_text:
                 scale = self.pipeline_options.images_scale
                 for element, _level in conv_res.document.iterate_items():
-                    if not isinstance(element, TextItem) or len(element.prov) == 0:
+                    if (
+                        not isinstance(element, TextItem)
+                        or not element.prov
+                        or not isinstance(prov := element.prov[0], ProvenanceItem)
+                    ):
                         continue
-                    crop_bbox = (
-                        element.prov[0]
-                        .bbox.scaled(scale=scale)
-                        .to_top_left_origin(page_height=page.size.height * scale)
+                    crop_bbox = prov.bbox.scaled(scale=scale).to_top_left_origin(
+                        page_height=page.size.height * scale
                     )
                     txt = self.extract_text_from_backend(page, crop_bbox)
                     element.text = txt
diff --git a/pyproject.toml b/pyproject.toml
index 8dc239382..8444ac4b6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -45,7 +45,6 @@ authors = [
 requires-python = '>=3.9,<4.0'
 dependencies = [
   'pydantic (>=2.0.0,<3.0.0)',
-  'docling-core[chunking] (>=2.50.1,<3.0.0)',
   'docling-parse (>=4.7.0,<5.0.0)',
   "docling-ibm-models>=3.9.1,<4",
   'filetype (>=1.2.0,<2.0.0)',
@@ -74,6 +73,7 @@ dependencies = [
   # 'scipy (>=1.6.0,<1.14.0) ; python_version < "3.10"',
   "accelerate>=1.0.0,<2",
   "polyfactory>=2.22.2",
+  "docling-core[chunking]",
 ]
 
 [project.urls]
@@ -160,6 +160,9 @@ constraints = [
 package = true
 default-groups = "all"
 
+[tool.uv.sources]
+docling-core = { git = "https://github.com/docling-project/docling-core.git", rev = "c75516516358f25add2682674fc7dc6eef2c5164" }
+
 [tool.setuptools.packages.find]
 include = ["docling*"]
 
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
index d7840e994..db52ba1b7 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.itxt
@@ -1,66 +1,14 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: section: group WebVTT cue block
-    item-2 at level 2: text: 00:11.000 --> 00:13.000
-    item-3 at level 2: inline: group WebVTT cue voice span
-      item-4 at level 3: text: Roger Bingham: 
-      item-5 at level 3: text: We are in New York City
-  item-6 at level 1: section: group WebVTT cue block
-    item-7 at level 2: text: 00:13.000 --> 00:16.000
-    item-8 at level 2: inline: group WebVTT cue voice span
-      item-9 at level 3: text: Roger Bingham: 
-      item-10 at level 3: text: We’re actually at the Lucern Hotel, just down the street
-  item-11 at level 1: section: group WebVTT cue block
-    item-12 at level 2: text: 00:16.000 --> 00:18.000
-    item-13 at level 2: inline: group WebVTT cue voice span
-      item-14 at level 3: text: Roger Bingham: 
-      item-15 at level 3: text: from the American Museum of Natural History
-  item-16 at level 1: section: group WebVTT cue block
-    item-17 at level 2: text: 00:18.000 --> 00:20.000
-    item-18 at level 2: inline: group WebVTT cue voice span
-      item-19 at level 3: text: Roger Bingham: 
-      item-20 at level 3: text: And with me is Neil deGrasse Tyson
-  item-21 at level 1: section: group WebVTT cue block
-    item-22 at level 2: text: 00:20.000 --> 00:22.000
-    item-23 at level 2: inline: group WebVTT cue voice span
-      item-24 at level 3: text: Roger Bingham: 
-      item-25 at level 3: text: Astrophysicist, Director of the Hayden Planetarium
-  item-26 at level 1: section: group WebVTT cue block
-    item-27 at level 2: text: 00:22.000 --> 00:24.000
-    item-28 at level 2: inline: group WebVTT cue voice span
-      item-29 at level 3: text: Roger Bingham: 
-      item-30 at level 3: text: at the AMNH.
-  item-31 at level 1: section: group WebVTT cue block
-    item-32 at level 2: text: 00:24.000 --> 00:26.000
-    item-33 at level 2: inline: group WebVTT cue voice span
-      item-34 at level 3: text: Roger Bingham: 
-      item-35 at level 3: text: Thank you for walking down here.
-  item-36 at level 1: section: group WebVTT cue block
-    item-37 at level 2: text: 00:27.000 --> 00:30.000
-    item-38 at level 2: inline: group WebVTT cue voice span
-      item-39 at level 3: text: Roger Bingham: 
-      item-40 at level 3: text: And I want to do a follow-up on the last conversation we did.
-  item-41 at level 1: section: group WebVTT cue block
-    item-42 at level 2: text: 00:30.000 --> 00:31.500
-    item-43 at level 2: inline: group WebVTT cue voice span
-      item-44 at level 3: text: Roger Bingham: 
-      item-45 at level 3: text: When we e-mailed—
-  item-46 at level 1: section: group WebVTT cue block
-    item-47 at level 2: text: 00:30.500 --> 00:32.500
-    item-48 at level 2: inline: group WebVTT cue voice span
-      item-49 at level 3: text: Neil deGrasse Tyson: 
-      item-50 at level 3: text: Didn’t we talk about enough in that conversation?
-  item-51 at level 1: section: group WebVTT cue block
-    item-52 at level 2: text: 00:32.000 --> 00:35.500
-    item-53 at level 2: inline: group WebVTT cue voice span
-      item-54 at level 3: text: Roger Bingham: 
-      item-55 at level 3: text: No! No no no no; 'cos 'cos obviously 'cos
-  item-56 at level 1: section: group WebVTT cue block
-    item-57 at level 2: text: 00:32.500 --> 00:33.500
-    item-58 at level 2: inline: group WebVTT cue voice span
-      item-59 at level 3: text: Neil deGrasse Tyson: 
-      item-60 at level 3: text: Laughs
-  item-61 at level 1: section: group WebVTT cue block
-    item-62 at level 2: text: 00:35.500 --> 00:38.000
-    item-63 at level 2: inline: group WebVTT cue voice span
-      item-64 at level 3: text: Roger Bingham: 
-      item-65 at level 3: text: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
+  item-1 at level 1: text: We are in New York City
+  item-2 at level 1: text: We’re actually at the Lucern Hotel, just down the street
+  item-3 at level 1: text: from the American Museum of Natural History
+  item-4 at level 1: text: And with me is Neil deGrasse Tyson
+  item-5 at level 1: text: Astrophysicist, Director of the Hayden Planetarium
+  item-6 at level 1: text: at the AMNH.
+  item-7 at level 1: text: Thank you for walking down here.
+  item-8 at level 1: text: And I want to do a follow-up on the last conversation we did.
+  item-9 at level 1: text: When we e-mailed—
+  item-10 at level 1: text: Didn’t we talk about enough in that conversation?
+  item-11 at level 1: text: No! No no no no; 'cos 'cos obviously 'cos
+  item-12 at level 1: text: Laughs
+  item-13 at level 1: text: You know I’m so excited my glasses are falling off here.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
index 831182560..5a7c9d29b 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.7.0",
+  "version": "1.8.0",
   "name": "webvtt_example_01",
   "origin": {
     "mimetype": "text/vtt",
@@ -18,1052 +18,291 @@
     "self_ref": "#/body",
     "children": [
       {
-        "$ref": "#/groups/0"
+        "$ref": "#/texts/0"
       },
       {
-        "$ref": "#/groups/2"
+        "$ref": "#/texts/1"
       },
       {
-        "$ref": "#/groups/4"
+        "$ref": "#/texts/2"
       },
       {
-        "$ref": "#/groups/6"
+        "$ref": "#/texts/3"
       },
       {
-        "$ref": "#/groups/8"
+        "$ref": "#/texts/4"
       },
       {
-        "$ref": "#/groups/10"
+        "$ref": "#/texts/5"
       },
       {
-        "$ref": "#/groups/12"
+        "$ref": "#/texts/6"
       },
       {
-        "$ref": "#/groups/14"
+        "$ref": "#/texts/7"
       },
       {
-        "$ref": "#/groups/16"
+        "$ref": "#/texts/8"
       },
       {
-        "$ref": "#/groups/18"
+        "$ref": "#/texts/9"
       },
       {
-        "$ref": "#/groups/20"
+        "$ref": "#/texts/10"
       },
       {
-        "$ref": "#/groups/22"
+        "$ref": "#/texts/11"
       },
       {
-        "$ref": "#/groups/24"
+        "$ref": "#/texts/12"
       }
     ],
     "content_layer": "body",
     "name": "_root_",
     "label": "unspecified"
   },
-  "groups": [
-    {
-      "self_ref": "#/groups/0",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/0"
-        },
-        {
-          "$ref": "#/groups/1"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/1",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/1"
-        },
-        {
-          "$ref": "#/texts/2"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    },
+  "groups": [],
+  "texts": [
     {
-      "self_ref": "#/groups/2",
+      "self_ref": "#/texts/0",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/3"
-        },
-        {
-          "$ref": "#/groups/3"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/3",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/4"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/5"
+          "start_time": 11.0,
+          "end_time": 13.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "We are in New York City",
+      "text": "We are in New York City"
     },
     {
-      "self_ref": "#/groups/4",
+      "self_ref": "#/texts/1",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/6"
-        },
-        {
-          "$ref": "#/groups/5"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/5",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/7"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/8"
+          "start_time": 13.0,
+          "end_time": 16.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "We’re actually at the Lucern Hotel, just down the street",
+      "text": "We’re actually at the Lucern Hotel, just down the street"
     },
     {
-      "self_ref": "#/groups/6",
+      "self_ref": "#/texts/2",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/9"
-        },
-        {
-          "$ref": "#/groups/7"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/7",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/10"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/11"
+          "start_time": 16.0,
+          "end_time": 18.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "from the American Museum of Natural History",
+      "text": "from the American Museum of Natural History"
     },
     {
-      "self_ref": "#/groups/8",
+      "self_ref": "#/texts/3",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/12"
-        },
-        {
-          "$ref": "#/groups/9"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/9",
-      "parent": {
-        "$ref": "#/groups/8"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/13"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/14"
+          "start_time": 18.0,
+          "end_time": 20.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "And with me is Neil deGrasse Tyson",
+      "text": "And with me is Neil deGrasse Tyson"
     },
     {
-      "self_ref": "#/groups/10",
+      "self_ref": "#/texts/4",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/15"
-        },
-        {
-          "$ref": "#/groups/11"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/11",
-      "parent": {
-        "$ref": "#/groups/10"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/16"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/17"
+          "start_time": 20.0,
+          "end_time": 22.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Astrophysicist, Director of the Hayden Planetarium",
+      "text": "Astrophysicist, Director of the Hayden Planetarium"
     },
     {
-      "self_ref": "#/groups/12",
+      "self_ref": "#/texts/5",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/18"
-        },
-        {
-          "$ref": "#/groups/13"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/13",
-      "parent": {
-        "$ref": "#/groups/12"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/19"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/20"
+          "start_time": 22.0,
+          "end_time": 24.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "at the AMNH.",
+      "text": "at the AMNH."
     },
     {
-      "self_ref": "#/groups/14",
+      "self_ref": "#/texts/6",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/21"
-        },
-        {
-          "$ref": "#/groups/15"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/15",
-      "parent": {
-        "$ref": "#/groups/14"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/22"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/23"
+          "start_time": 24.0,
+          "end_time": 26.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Thank you for walking down here.",
+      "text": "Thank you for walking down here."
     },
     {
-      "self_ref": "#/groups/16",
+      "self_ref": "#/texts/7",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/24"
-        },
-        {
-          "$ref": "#/groups/17"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/17",
-      "parent": {
-        "$ref": "#/groups/16"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/25"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/26"
+          "start_time": 27.0,
+          "end_time": 30.0,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "And I want to do a follow-up on the last conversation we did.",
+      "text": "And I want to do a follow-up on the last conversation we did."
     },
     {
-      "self_ref": "#/groups/18",
+      "self_ref": "#/texts/8",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/27"
-        },
-        {
-          "$ref": "#/groups/19"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/19",
-      "parent": {
-        "$ref": "#/groups/18"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/28"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/29"
+          "start_time": 30.0,
+          "end_time": 31.5,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "When we e-mailed—",
+      "text": "When we e-mailed—"
     },
     {
-      "self_ref": "#/groups/20",
+      "self_ref": "#/texts/9",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/30"
-        },
-        {
-          "$ref": "#/groups/21"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/21",
-      "parent": {
-        "$ref": "#/groups/20"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/31"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/32"
+          "start_time": 30.5,
+          "end_time": 32.5,
+          "voice": "Neil deGrasse Tyson"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Didn’t we talk about enough in that conversation?",
+      "text": "Didn’t we talk about enough in that conversation?"
     },
     {
-      "self_ref": "#/groups/22",
+      "self_ref": "#/texts/10",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/33"
-        },
-        {
-          "$ref": "#/groups/23"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/23",
-      "parent": {
-        "$ref": "#/groups/22"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/34"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/35"
+          "start_time": 32.0,
+          "end_time": 35.5,
+          "voice": "Roger Bingham"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "No! No no no no; 'cos 'cos obviously 'cos",
+      "text": "No! No no no no; 'cos 'cos obviously 'cos"
     },
     {
-      "self_ref": "#/groups/24",
+      "self_ref": "#/texts/11",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/36"
-        },
-        {
-          "$ref": "#/groups/25"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/25",
-      "parent": {
-        "$ref": "#/groups/24"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/37"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/38"
+          "start_time": 32.5,
+          "end_time": 33.5,
+          "voice": "Neil deGrasse Tyson"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    }
-  ],
-  "texts": [
-    {
-      "self_ref": "#/texts/0",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:11.000 --> 00:13.000",
-      "text": "00:11.000 --> 00:13.000"
-    },
-    {
-      "self_ref": "#/texts/1",
-      "parent": {
-        "$ref": "#/groups/1"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/2",
-      "parent": {
-        "$ref": "#/groups/1"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "We are in New York City",
-      "text": "We are in New York City",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/3",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:13.000 --> 00:16.000",
-      "text": "00:13.000 --> 00:16.000"
-    },
-    {
-      "self_ref": "#/texts/4",
-      "parent": {
-        "$ref": "#/groups/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/5",
-      "parent": {
-        "$ref": "#/groups/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "We’re actually at the Lucern Hotel, just down the street",
-      "text": "We’re actually at the Lucern Hotel, just down the street",
+      "orig": "Laughs",
+      "text": "Laughs",
       "formatting": {
         "bold": false,
-        "italic": false,
+        "italic": true,
         "underline": false,
         "strikethrough": false,
         "script": "baseline"
       }
     },
     {
-      "self_ref": "#/texts/6",
+      "self_ref": "#/texts/12",
       "parent": {
-        "$ref": "#/groups/4"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "00:16.000 --> 00:18.000",
-      "text": "00:16.000 --> 00:18.000"
-    },
-    {
-      "self_ref": "#/texts/7",
-      "parent": {
-        "$ref": "#/groups/5"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/8",
-      "parent": {
-        "$ref": "#/groups/5"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "from the American Museum of Natural History",
-      "text": "from the American Museum of Natural History",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/9",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:18.000 --> 00:20.000",
-      "text": "00:18.000 --> 00:20.000"
-    },
-    {
-      "self_ref": "#/texts/10",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/11",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "And with me is Neil deGrasse Tyson",
-      "text": "And with me is Neil deGrasse Tyson",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/12",
-      "parent": {
-        "$ref": "#/groups/8"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:20.000 --> 00:22.000",
-      "text": "00:20.000 --> 00:22.000"
-    },
-    {
-      "self_ref": "#/texts/13",
-      "parent": {
-        "$ref": "#/groups/9"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/14",
-      "parent": {
-        "$ref": "#/groups/9"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Astrophysicist, Director of the Hayden Planetarium",
-      "text": "Astrophysicist, Director of the Hayden Planetarium",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/15",
-      "parent": {
-        "$ref": "#/groups/10"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:22.000 --> 00:24.000",
-      "text": "00:22.000 --> 00:24.000"
-    },
-    {
-      "self_ref": "#/texts/16",
-      "parent": {
-        "$ref": "#/groups/11"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/17",
-      "parent": {
-        "$ref": "#/groups/11"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "at the AMNH.",
-      "text": "at the AMNH.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/18",
-      "parent": {
-        "$ref": "#/groups/12"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:24.000 --> 00:26.000",
-      "text": "00:24.000 --> 00:26.000"
-    },
-    {
-      "self_ref": "#/texts/19",
-      "parent": {
-        "$ref": "#/groups/13"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/20",
-      "parent": {
-        "$ref": "#/groups/13"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Thank you for walking down here.",
-      "text": "Thank you for walking down here.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/21",
-      "parent": {
-        "$ref": "#/groups/14"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:27.000 --> 00:30.000",
-      "text": "00:27.000 --> 00:30.000"
-    },
-    {
-      "self_ref": "#/texts/22",
-      "parent": {
-        "$ref": "#/groups/15"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/23",
-      "parent": {
-        "$ref": "#/groups/15"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "And I want to do a follow-up on the last conversation we did.",
-      "text": "And I want to do a follow-up on the last conversation we did.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/24",
-      "parent": {
-        "$ref": "#/groups/16"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:30.000 --> 00:31.500",
-      "text": "00:30.000 --> 00:31.500"
-    },
-    {
-      "self_ref": "#/texts/25",
-      "parent": {
-        "$ref": "#/groups/17"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/26",
-      "parent": {
-        "$ref": "#/groups/17"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "When we e-mailed—",
-      "text": "When we e-mailed—",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/27",
-      "parent": {
-        "$ref": "#/groups/18"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:30.500 --> 00:32.500",
-      "text": "00:30.500 --> 00:32.500"
-    },
-    {
-      "self_ref": "#/texts/28",
-      "parent": {
-        "$ref": "#/groups/19"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Neil deGrasse Tyson: ",
-      "text": "Neil deGrasse Tyson: "
-    },
-    {
-      "self_ref": "#/texts/29",
-      "parent": {
-        "$ref": "#/groups/19"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Didn’t we talk about enough in that conversation?",
-      "text": "Didn’t we talk about enough in that conversation?",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/30",
-      "parent": {
-        "$ref": "#/groups/20"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:32.000 --> 00:35.500",
-      "text": "00:32.000 --> 00:35.500"
-    },
-    {
-      "self_ref": "#/texts/31",
-      "parent": {
-        "$ref": "#/groups/21"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/32",
-      "parent": {
-        "$ref": "#/groups/21"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "No! No no no no; 'cos 'cos obviously 'cos",
-      "text": "No! No no no no; 'cos 'cos obviously 'cos",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/33",
-      "parent": {
-        "$ref": "#/groups/22"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:32.500 --> 00:33.500",
-      "text": "00:32.500 --> 00:33.500"
-    },
-    {
-      "self_ref": "#/texts/34",
-      "parent": {
-        "$ref": "#/groups/23"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Neil deGrasse Tyson: ",
-      "text": "Neil deGrasse Tyson: "
-    },
-    {
-      "self_ref": "#/texts/35",
-      "parent": {
-        "$ref": "#/groups/23"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Laughs",
-      "text": "Laughs",
-      "formatting": {
-        "bold": false,
-        "italic": true,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/36",
-      "parent": {
-        "$ref": "#/groups/24"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:35.500 --> 00:38.000",
-      "text": "00:35.500 --> 00:38.000"
-    },
-    {
-      "self_ref": "#/texts/37",
-      "parent": {
-        "$ref": "#/groups/25"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Roger Bingham: ",
-      "text": "Roger Bingham: "
-    },
-    {
-      "self_ref": "#/texts/38",
-      "parent": {
-        "$ref": "#/groups/25"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
+      "prov": [
+        {
+          "start_time": 35.5,
+          "end_time": 38.0,
+          "voice": "Roger Bingham"
+        }
+      ],
       "orig": "You know I’m so excited my glasses are falling off here.",
-      "text": "You know I’m so excited my glasses are falling off here.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "text": "You know I’m so excited my glasses are falling off here."
     }
   ],
   "pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
index c57670289..95d9e6575 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_01.vtt.md
@@ -1,51 +1,25 @@
-00:11.000 --> 00:13.000
+We are in New York City
 
-Roger Bingham:  We are in New York City
+We’re actually at the Lucern Hotel, just down the street
 
-00:13.000 --> 00:16.000
+from the American Museum of Natural History
 
-Roger Bingham:  We’re actually at the Lucern Hotel, just down the street
+And with me is Neil deGrasse Tyson
 
-00:16.000 --> 00:18.000
+Astrophysicist, Director of the Hayden Planetarium
 
-Roger Bingham:  from the American Museum of Natural History
+at the AMNH.
 
-00:18.000 --> 00:20.000
+Thank you for walking down here.
 
-Roger Bingham:  And with me is Neil deGrasse Tyson
+And I want to do a follow-up on the last conversation we did.
 
-00:20.000 --> 00:22.000
+When we e-mailed—
 
-Roger Bingham:  Astrophysicist, Director of the Hayden Planetarium
+Didn’t we talk about enough in that conversation?
 
-00:22.000 --> 00:24.000
+No! No no no no; 'cos 'cos obviously 'cos
 
-Roger Bingham:  at the AMNH.
+*Laughs*
 
-00:24.000 --> 00:26.000
-
-Roger Bingham:  Thank you for walking down here.
-
-00:27.000 --> 00:30.000
-
-Roger Bingham:  And I want to do a follow-up on the last conversation we did.
-
-00:30.000 --> 00:31.500
-
-Roger Bingham:  When we e-mailed—
-
-00:30.500 --> 00:32.500
-
-Neil deGrasse Tyson:  Didn’t we talk about enough in that conversation?
-
-00:32.000 --> 00:35.500
-
-Roger Bingham:  No! No no no no; 'cos 'cos obviously 'cos
-
-00:32.500 --> 00:33.500
-
-Neil deGrasse Tyson:  *Laughs*
-
-00:35.500 --> 00:38.000
-
-Roger Bingham:  You know I’m so excited my glasses are falling off here.
\ No newline at end of file
+You know I’m so excited my glasses are falling off here.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
index 6d90404ff..56f63bc3f 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.itxt
@@ -1,22 +1,12 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: section: group WebVTT cue block
-    item-2 at level 2: text: 00:00.000 --> 00:02.000
-    item-3 at level 2: inline: group WebVTT cue voice span
-      item-4 at level 3: text: Esme (first, loud): 
-      item-5 at level 3: text: It’s a blue apple tree!
-  item-6 at level 1: section: group WebVTT cue block
-    item-7 at level 2: text: 00:02.000 --> 00:04.000
-    item-8 at level 2: inline: group WebVTT cue voice span
-      item-9 at level 3: text: Mary: 
-      item-10 at level 3: text: No way!
-  item-11 at level 1: section: group WebVTT cue block
-    item-12 at level 2: text: 00:04.000 --> 00:06.000
-    item-13 at level 2: inline: group WebVTT cue voice span
-      item-14 at level 3: text: Esme: 
-      item-15 at level 3: text: Hee!
-    item-16 at level 2: text: laughter
-  item-17 at level 1: section: group WebVTT cue block
-    item-18 at level 2: text: 00:06.000 --> 00:08.000
-    item-19 at level 2: inline: group WebVTT cue voice span
-      item-20 at level 3: text: Mary (loud): 
-      item-21 at level 3: text: That’s awesome!
\ No newline at end of file
+  item-1 at level 1: text: It’s a blue apple tree!
+  item-2 at level 1: text: No way!
+  item-3 at level 1: inline: group WebVTT cue span
+    item-4 at level 2: text: Hee!
+    item-5 at level 2: text:  
+    item-6 at level 2: text: laughter
+  item-7 at level 1: text: That’s awesome!
+  item-8 at level 1: inline: group WebVTT cue span
+    item-9 at level 2: text: Sur les 
+    item-10 at level 2: text: playground
+    item-11 at level 2: text: , ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
index 72647d93d..67a95ef50 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.json
@@ -1,10 +1,10 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.7.0",
+  "version": "1.8.0",
   "name": "webvtt_example_02",
   "origin": {
     "mimetype": "text/vtt",
-    "binary_hash": 5029965721282070624,
+    "binary_hash": 8584853280299071027,
     "filename": "webvtt_example_02.vtt"
   },
   "furniture": {
@@ -18,16 +18,19 @@
     "self_ref": "#/body",
     "children": [
       {
-        "$ref": "#/groups/0"
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/texts/1"
       },
       {
-        "$ref": "#/groups/2"
+        "$ref": "#/groups/0"
       },
       {
-        "$ref": "#/groups/4"
+        "$ref": "#/texts/5"
       },
       {
-        "$ref": "#/groups/6"
+        "$ref": "#/groups/1"
       }
     ],
     "content_layer": "body",
@@ -41,70 +44,22 @@
         "$ref": "#/body"
       },
       "children": [
-        {
-          "$ref": "#/texts/0"
-        },
-        {
-          "$ref": "#/groups/1"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/1",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/1"
-        },
         {
           "$ref": "#/texts/2"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    },
-    {
-      "self_ref": "#/groups/2",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [
+        },
         {
           "$ref": "#/texts/3"
         },
-        {
-          "$ref": "#/groups/3"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/3",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [
         {
           "$ref": "#/texts/4"
-        },
-        {
-          "$ref": "#/texts/5"
         }
       ],
       "content_layer": "body",
-      "name": "WebVTT cue voice span",
+      "name": "WebVTT cue span",
       "label": "inline"
     },
     {
-      "self_ref": "#/groups/4",
+      "self_ref": "#/groups/1",
       "parent": {
         "$ref": "#/body"
       },
@@ -112,23 +67,6 @@
         {
           "$ref": "#/texts/6"
         },
-        {
-          "$ref": "#/groups/5"
-        },
-        {
-          "$ref": "#/texts/9"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/5",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [
         {
           "$ref": "#/texts/7"
         },
@@ -137,41 +75,7 @@
         }
       ],
       "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    },
-    {
-      "self_ref": "#/groups/6",
-      "parent": {
-        "$ref": "#/body"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/10"
-        },
-        {
-          "$ref": "#/groups/7"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/7",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/11"
-        },
-        {
-          "$ref": "#/texts/12"
-        }
-      ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
+      "name": "WebVTT cue span",
       "label": "inline"
     }
   ],
@@ -179,143 +83,161 @@
     {
       "self_ref": "#/texts/0",
       "parent": {
-        "$ref": "#/groups/0"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "00:00.000 --> 00:02.000",
-      "text": "00:00.000 --> 00:02.000"
+      "prov": [
+        {
+          "start_time": 0.0,
+          "end_time": 2.0,
+          "voice": "Esme",
+          "classes": [
+            "v.first.loud"
+          ]
+        }
+      ],
+      "orig": "It’s a blue apple tree!",
+      "text": "It’s a blue apple tree!"
     },
     {
       "self_ref": "#/texts/1",
       "parent": {
-        "$ref": "#/groups/1"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "Esme (first, loud): ",
-      "text": "Esme (first, loud): "
+      "prov": [
+        {
+          "start_time": 2.0,
+          "end_time": 4.0,
+          "voice": "Mary"
+        }
+      ],
+      "orig": "No way!",
+      "text": "No way!"
     },
     {
       "self_ref": "#/texts/2",
       "parent": {
-        "$ref": "#/groups/1"
+        "$ref": "#/groups/0"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "It’s a blue apple tree!",
-      "text": "It’s a blue apple tree!",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "prov": [
+        {
+          "start_time": 4.0,
+          "end_time": 6.0,
+          "voice": "Esme"
+        }
+      ],
+      "orig": "Hee!",
+      "text": "Hee!"
     },
     {
       "self_ref": "#/texts/3",
       "parent": {
-        "$ref": "#/groups/2"
+        "$ref": "#/groups/0"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "00:02.000 --> 00:04.000",
-      "text": "00:02.000 --> 00:04.000"
+      "prov": [
+        {
+          "start_time": 4.0,
+          "end_time": 6.0
+        }
+      ],
+      "orig": " ",
+      "text": " "
     },
     {
       "self_ref": "#/texts/4",
       "parent": {
-        "$ref": "#/groups/3"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Mary: ",
-      "text": "Mary: "
-    },
-    {
-      "self_ref": "#/texts/5",
-      "parent": {
-        "$ref": "#/groups/3"
+        "$ref": "#/groups/0"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "No way!",
-      "text": "No way!",
+      "prov": [
+        {
+          "start_time": 4.0,
+          "end_time": 6.0
+        }
+      ],
+      "orig": "laughter",
+      "text": "laughter",
       "formatting": {
         "bold": false,
-        "italic": false,
+        "italic": true,
         "underline": false,
         "strikethrough": false,
         "script": "baseline"
       }
     },
     {
-      "self_ref": "#/texts/6",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:04.000 --> 00:06.000",
-      "text": "00:04.000 --> 00:06.000"
-    },
-    {
-      "self_ref": "#/texts/7",
+      "self_ref": "#/texts/5",
       "parent": {
-        "$ref": "#/groups/5"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "Esme: ",
-      "text": "Esme: "
+      "prov": [
+        {
+          "start_time": 6.0,
+          "end_time": 8.0,
+          "voice": "Mary",
+          "classes": [
+            "v.loud"
+          ]
+        }
+      ],
+      "orig": "That’s awesome!",
+      "text": "That’s awesome!"
     },
     {
-      "self_ref": "#/texts/8",
+      "self_ref": "#/texts/6",
       "parent": {
-        "$ref": "#/groups/5"
+        "$ref": "#/groups/1"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "Hee!",
-      "text": "Hee!",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "prov": [
+        {
+          "start_time": 8.0,
+          "end_time": 10.0
+        }
+      ],
+      "orig": "Sur les ",
+      "text": "Sur les "
     },
     {
-      "self_ref": "#/texts/9",
+      "self_ref": "#/texts/7",
       "parent": {
-        "$ref": "#/groups/4"
+        "$ref": "#/groups/1"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "laughter",
-      "text": "laughter",
+      "prov": [
+        {
+          "start_time": 8.0,
+          "end_time": 10.0,
+          "languages": [
+            "en"
+          ],
+          "classes": [
+            "i.foreignphrase"
+          ]
+        }
+      ],
+      "orig": "playground",
+      "text": "playground",
       "formatting": {
         "bold": false,
         "italic": true,
@@ -325,47 +247,21 @@
       }
     },
     {
-      "self_ref": "#/texts/10",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:06.000 --> 00:08.000",
-      "text": "00:06.000 --> 00:08.000"
-    },
-    {
-      "self_ref": "#/texts/11",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Mary (loud): ",
-      "text": "Mary (loud): "
-    },
-    {
-      "self_ref": "#/texts/12",
+      "self_ref": "#/texts/8",
       "parent": {
-        "$ref": "#/groups/7"
+        "$ref": "#/groups/1"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "That’s awesome!",
-      "text": "That’s awesome!",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "prov": [
+        {
+          "start_time": 8.0,
+          "end_time": 10.0
+        }
+      ],
+      "orig": ", ici à Montpellier",
+      "text": ", ici à Montpellier"
     }
   ],
   "pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
index db84cf116..5c6485f3a 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_02.vtt.md
@@ -1,17 +1,9 @@
-00:00.000 --> 00:02.000
+It’s a blue apple tree!
 
-Esme (first, loud):  It’s a blue apple tree!
+No way!
 
-00:02.000 --> 00:04.000
+Hee! *laughter*
 
-Mary:  No way!
+That’s awesome!
 
-00:04.000 --> 00:06.000
-
-Esme:  Hee!
-
-*laughter*
-
-00:06.000 --> 00:08.000
-
-Mary (loud):  That’s awesome!
\ No newline at end of file
+Sur les  *playground* , ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
index ca344e595..a46794123 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.itxt
@@ -1,77 +1,18 @@
 item-0 at level 0: unspecified: group _root_
-  item-1 at level 1: section: group WebVTT cue block
-    item-2 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
-    item-3 at level 2: text: 00:00:04.963 --> 00:00:08.571
-    item-4 at level 2: inline: group WebVTT cue voice span
-      item-5 at level 3: text: Speaker A: 
-      item-6 at level 3: text: OK, I think now we should be recording
-  item-7 at level 1: section: group WebVTT cue block
-    item-8 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
-    item-9 at level 2: text: 00:00:08.571 --> 00:00:09.403
-    item-10 at level 2: inline: group WebVTT cue voice span
-      item-11 at level 3: text: Speaker A: 
-      item-12 at level 3: text: properly.
-  item-13 at level 1: section: group WebVTT cue block
-    item-14 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
-    item-15 at level 2: text: 00:00:10.683 --> 00:00:11.563
-    item-16 at level 2: text: Good.
-  item-17 at level 1: section: group WebVTT cue block
-    item-18 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
-    item-19 at level 2: text: 00:00:13.363 --> 00:00:13.803
-    item-20 at level 2: inline: group WebVTT cue voice span
-      item-21 at level 3: text: Speaker A: 
-      item-22 at level 3: text: Yeah.
-  item-23 at level 1: section: group WebVTT cue block
-    item-24 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
-    item-25 at level 2: text: 00:00:49.603 --> 00:00:53.363
-    item-26 at level 2: inline: group WebVTT cue voice span
-      item-27 at level 3: text: Speaker B: 
-      item-28 at level 3: text: I was also thinking.
-  item-29 at level 1: section: group WebVTT cue block
-    item-30 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
-    item-31 at level 2: text: 00:00:54.963 --> 00:01:02.072
-    item-32 at level 2: inline: group WebVTT cue voice span
-      item-33 at level 3: text: Speaker B: 
-      item-34 at level 3: text: Would be maybe good to create items,
-  item-35 at level 1: section: group WebVTT cue block
-    item-36 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
-    item-37 at level 2: text: 00:01:02.072 --> 00:01:06.811
-    item-38 at level 2: inline: group WebVTT cue voice span
-      item-39 at level 3: text: Speaker B: 
-      item-40 at level 3: text: some metadata, some options that can be specific.
-  item-41 at level 1: section: group WebVTT cue block
-    item-42 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
-    item-43 at level 2: text: 00:01:10.243 --> 00:01:13.014
-    item-44 at level 2: inline: group WebVTT cue voice span
-      item-45 at level 3: text: Speaker A: 
-      item-46 at level 3: text: Yeah, I mean I think you went even more than
-  item-47 at level 1: section: group WebVTT cue block
-    item-48 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
-    item-49 at level 2: text: 00:01:10.563 --> 00:01:12.643
-    item-50 at level 2: inline: group WebVTT cue voice span
-      item-51 at level 3: text: Speaker B: 
-      item-52 at level 3: text: But we preserved the atoms.
-  item-53 at level 1: section: group WebVTT cue block
-    item-54 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
-    item-55 at level 2: text: 00:01:13.014 --> 00:01:15.907
-    item-56 at level 2: inline: group WebVTT cue voice span
-      item-57 at level 3: text: Speaker A: 
-      item-58 at level 3: text: than me. I just opened the format.
-  item-59 at level 1: section: group WebVTT cue block
-    item-60 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
-    item-61 at level 2: text: 00:01:50.222 --> 00:01:51.643
-    item-62 at level 2: inline: group WebVTT cue voice span
-      item-63 at level 3: text: Speaker A: 
-      item-64 at level 3: text: give it a try, yeah.
-  item-65 at level 1: section: group WebVTT cue block
-    item-66 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
-    item-67 at level 2: text: 00:01:52.043 --> 00:01:55.043
-    item-68 at level 2: inline: group WebVTT cue voice span
-      item-69 at level 3: text: Speaker B: 
-      item-70 at level 3: text: Okay, talk to you later.
-  item-71 at level 1: section: group WebVTT cue block
-    item-72 at level 2: text: 62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
-    item-73 at level 2: text: 00:01:54.603 --> 00:01:55.283
-    item-74 at level 2: inline: group WebVTT cue voice span
-      item-75 at level 3: text: Speaker A: 
-      item-76 at level 3: text: See you.
\ No newline at end of file
+  item-1 at level 1: text: OK,
+  item-2 at level 1: text: I think now we should be recording
+  item-3 at level 1: text: properly.
+  item-4 at level 1: text: Good.
+  item-5 at level 1: text: Yeah.
+  item-6 at level 1: text: I was also thinking.
+  item-7 at level 1: text: Would be maybe good to create items,
+  item-8 at level 1: text: some metadata,
+  item-9 at level 1: text: some options that can be specific.
+  item-10 at level 1: text: Yeah,
+  item-11 at level 1: text: I mean I think you went even more than
+  item-12 at level 1: text: But we preserved the atoms.
+  item-13 at level 1: text: than me.
+  item-14 at level 1: text: I just opened the format.
+  item-15 at level 1: text: give it a try, yeah.
+  item-16 at level 1: text: Okay, talk to you later.
+  item-17 at level 1: text: See you.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
index 5df08e2bf..dddce0f28 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.json
@@ -1,6 +1,6 @@
 {
   "schema_name": "DoclingDocument",
-  "version": "1.7.0",
+  "version": "1.8.0",
   "name": "webvtt_example_03",
   "origin": {
     "mimetype": "text/vtt",
@@ -18,1218 +18,384 @@
     "self_ref": "#/body",
     "children": [
       {
-        "$ref": "#/groups/0"
+        "$ref": "#/texts/0"
       },
       {
-        "$ref": "#/groups/2"
+        "$ref": "#/texts/1"
       },
       {
-        "$ref": "#/groups/4"
+        "$ref": "#/texts/2"
       },
       {
-        "$ref": "#/groups/5"
+        "$ref": "#/texts/3"
       },
       {
-        "$ref": "#/groups/7"
+        "$ref": "#/texts/4"
       },
       {
-        "$ref": "#/groups/9"
+        "$ref": "#/texts/5"
       },
       {
-        "$ref": "#/groups/11"
+        "$ref": "#/texts/6"
       },
       {
-        "$ref": "#/groups/13"
+        "$ref": "#/texts/7"
       },
       {
-        "$ref": "#/groups/15"
+        "$ref": "#/texts/8"
       },
       {
-        "$ref": "#/groups/17"
+        "$ref": "#/texts/9"
       },
       {
-        "$ref": "#/groups/19"
+        "$ref": "#/texts/10"
       },
       {
-        "$ref": "#/groups/21"
+        "$ref": "#/texts/11"
       },
       {
-        "$ref": "#/groups/23"
+        "$ref": "#/texts/12"
+      },
+      {
+        "$ref": "#/texts/13"
+      },
+      {
+        "$ref": "#/texts/14"
+      },
+      {
+        "$ref": "#/texts/15"
+      },
+      {
+        "$ref": "#/texts/16"
       }
     ],
     "content_layer": "body",
     "name": "_root_",
     "label": "unspecified"
   },
-  "groups": [
+  "groups": [],
+  "texts": [
     {
-      "self_ref": "#/groups/0",
+      "self_ref": "#/texts/0",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/0"
-        },
-        {
-          "$ref": "#/texts/1"
-        },
-        {
-          "$ref": "#/groups/1"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/1",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/2"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/3"
+          "start_time": 4.963,
+          "end_time": 8.571,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "OK,",
+      "text": "OK,"
     },
     {
-      "self_ref": "#/groups/2",
+      "self_ref": "#/texts/1",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/4"
-        },
-        {
-          "$ref": "#/texts/5"
-        },
-        {
-          "$ref": "#/groups/3"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/3",
-      "parent": {
-        "$ref": "#/groups/2"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/6"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/7"
+          "start_time": 4.963,
+          "end_time": 8.571,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "I think now we should be recording",
+      "text": "I think now we should be recording"
     },
     {
-      "self_ref": "#/groups/4",
+      "self_ref": "#/texts/2",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/8"
-        },
-        {
-          "$ref": "#/texts/9"
-        },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/10"
+          "start_time": 8.571,
+          "end_time": 9.403,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
+      "orig": "properly.",
+      "text": "properly."
     },
     {
-      "self_ref": "#/groups/5",
+      "self_ref": "#/texts/3",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/11"
-        },
-        {
-          "$ref": "#/texts/12"
-        },
-        {
-          "$ref": "#/groups/6"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/6",
-      "parent": {
-        "$ref": "#/groups/5"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/13"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/14"
+          "start_time": 10.683,
+          "end_time": 11.563,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Good.",
+      "text": "Good."
     },
     {
-      "self_ref": "#/groups/7",
+      "self_ref": "#/texts/4",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/15"
-        },
-        {
-          "$ref": "#/texts/16"
-        },
-        {
-          "$ref": "#/groups/8"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/8",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/17"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/18"
+          "start_time": 13.363,
+          "end_time": 13.803,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Yeah.",
+      "text": "Yeah."
     },
     {
-      "self_ref": "#/groups/9",
+      "self_ref": "#/texts/5",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/19"
-        },
-        {
-          "$ref": "#/texts/20"
-        },
-        {
-          "$ref": "#/groups/10"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/10",
-      "parent": {
-        "$ref": "#/groups/9"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/21"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/22"
+          "start_time": 49.603,
+          "end_time": 53.363,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
+          "voice": "Speaker B"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "I was also thinking.",
+      "text": "I was also thinking."
     },
     {
-      "self_ref": "#/groups/11",
+      "self_ref": "#/texts/6",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/23"
-        },
-        {
-          "$ref": "#/texts/24"
-        },
-        {
-          "$ref": "#/groups/12"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/12",
-      "parent": {
-        "$ref": "#/groups/11"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/25"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/26"
+          "start_time": 54.963,
+          "end_time": 62.072,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
+          "voice": "Speaker B"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Would be maybe good to create items,",
+      "text": "Would be maybe good to create items,"
     },
     {
-      "self_ref": "#/groups/13",
+      "self_ref": "#/texts/7",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/27"
-        },
-        {
-          "$ref": "#/texts/28"
-        },
-        {
-          "$ref": "#/groups/14"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/14",
-      "parent": {
-        "$ref": "#/groups/13"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/29"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/30"
+          "start_time": 62.072,
+          "end_time": 66.811,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+          "voice": "Speaker B"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "some metadata,",
+      "text": "some metadata,"
     },
     {
-      "self_ref": "#/groups/15",
+      "self_ref": "#/texts/8",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/31"
-        },
-        {
-          "$ref": "#/texts/32"
-        },
-        {
-          "$ref": "#/groups/16"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/16",
-      "parent": {
-        "$ref": "#/groups/15"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/33"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/34"
+          "start_time": 62.072,
+          "end_time": 66.811,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
+          "voice": "Speaker B"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "some options that can be specific.",
+      "text": "some options that can be specific."
     },
     {
-      "self_ref": "#/groups/17",
+      "self_ref": "#/texts/9",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/35"
-        },
-        {
-          "$ref": "#/texts/36"
-        },
-        {
-          "$ref": "#/groups/18"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/18",
-      "parent": {
-        "$ref": "#/groups/17"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/37"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/38"
+          "start_time": 70.243,
+          "end_time": 73.014,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "Yeah,",
+      "text": "Yeah,"
     },
     {
-      "self_ref": "#/groups/19",
+      "self_ref": "#/texts/10",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/39"
-        },
-        {
-          "$ref": "#/texts/40"
-        },
-        {
-          "$ref": "#/groups/20"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/20",
-      "parent": {
-        "$ref": "#/groups/19"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/41"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/42"
+          "start_time": 70.243,
+          "end_time": 73.014,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "I mean I think you went even more than",
+      "text": "I mean I think you went even more than"
     },
     {
-      "self_ref": "#/groups/21",
+      "self_ref": "#/texts/11",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/43"
-        },
-        {
-          "$ref": "#/texts/44"
-        },
-        {
-          "$ref": "#/groups/22"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/22",
-      "parent": {
-        "$ref": "#/groups/21"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/45"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/46"
+          "start_time": 70.563,
+          "end_time": 72.643,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
+          "voice": "Speaker B"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
+      "orig": "But we preserved the atoms.",
+      "text": "But we preserved the atoms."
     },
     {
-      "self_ref": "#/groups/23",
+      "self_ref": "#/texts/12",
       "parent": {
         "$ref": "#/body"
       },
-      "children": [
-        {
-          "$ref": "#/texts/47"
-        },
-        {
-          "$ref": "#/texts/48"
-        },
-        {
-          "$ref": "#/groups/24"
-        }
-      ],
+      "children": [],
       "content_layer": "body",
-      "name": "WebVTT cue block",
-      "label": "section"
-    },
-    {
-      "self_ref": "#/groups/24",
-      "parent": {
-        "$ref": "#/groups/23"
-      },
-      "children": [
-        {
-          "$ref": "#/texts/49"
-        },
+      "label": "text",
+      "prov": [
         {
-          "$ref": "#/texts/50"
+          "start_time": 73.014,
+          "end_time": 75.907,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+          "voice": "Speaker A"
         }
       ],
-      "content_layer": "body",
-      "name": "WebVTT cue voice span",
-      "label": "inline"
-    }
-  ],
-  "texts": [
-    {
-      "self_ref": "#/texts/0",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
-    },
-    {
-      "self_ref": "#/texts/1",
-      "parent": {
-        "$ref": "#/groups/0"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:00:04.963 --> 00:00:08.571",
-      "text": "00:00:04.963 --> 00:00:08.571"
-    },
-    {
-      "self_ref": "#/texts/2",
-      "parent": {
-        "$ref": "#/groups/1"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/3",
-      "parent": {
-        "$ref": "#/groups/1"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "OK, I think now we should be recording",
-      "text": "OK, I think now we should be recording",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "orig": "than me.",
+      "text": "than me."
     },
     {
-      "self_ref": "#/texts/4",
+      "self_ref": "#/texts/13",
       "parent": {
-        "$ref": "#/groups/2"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1"
+      "prov": [
+        {
+          "start_time": 73.014,
+          "end_time": 75.907,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "I just opened the format.",
+      "text": "I just opened the format."
     },
     {
-      "self_ref": "#/texts/5",
+      "self_ref": "#/texts/14",
       "parent": {
-        "$ref": "#/groups/2"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "00:00:08.571 --> 00:00:09.403",
-      "text": "00:00:08.571 --> 00:00:09.403"
+      "prov": [
+        {
+          "start_time": 110.222,
+          "end_time": 111.643,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
+          "voice": "Speaker A"
+        }
+      ],
+      "orig": "give it a try, yeah.",
+      "text": "give it a try, yeah."
     },
     {
-      "self_ref": "#/texts/6",
+      "self_ref": "#/texts/15",
       "parent": {
-        "$ref": "#/groups/3"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
+      "prov": [
+        {
+          "start_time": 112.043,
+          "end_time": 115.043,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
+          "voice": "Speaker B"
+        }
+      ],
+      "orig": "Okay, talk to you later.",
+      "text": "Okay, talk to you later."
     },
     {
-      "self_ref": "#/texts/7",
+      "self_ref": "#/texts/16",
       "parent": {
-        "$ref": "#/groups/3"
+        "$ref": "#/body"
       },
       "children": [],
       "content_layer": "body",
       "label": "text",
-      "prov": [],
-      "orig": "properly.",
-      "text": "properly.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/8",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
-    },
-    {
-      "self_ref": "#/texts/9",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:00:10.683 --> 00:00:11.563",
-      "text": "00:00:10.683 --> 00:00:11.563"
-    },
-    {
-      "self_ref": "#/texts/10",
-      "parent": {
-        "$ref": "#/groups/4"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Good.",
-      "text": "Good.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/11",
-      "parent": {
-        "$ref": "#/groups/5"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0"
-    },
-    {
-      "self_ref": "#/texts/12",
-      "parent": {
-        "$ref": "#/groups/5"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:00:13.363 --> 00:00:13.803",
-      "text": "00:00:13.363 --> 00:00:13.803"
-    },
-    {
-      "self_ref": "#/texts/13",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/14",
-      "parent": {
-        "$ref": "#/groups/6"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Yeah.",
-      "text": "Yeah.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/15",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0"
-    },
-    {
-      "self_ref": "#/texts/16",
-      "parent": {
-        "$ref": "#/groups/7"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:00:49.603 --> 00:00:53.363",
-      "text": "00:00:49.603 --> 00:00:53.363"
-    },
-    {
-      "self_ref": "#/texts/17",
-      "parent": {
-        "$ref": "#/groups/8"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker B: ",
-      "text": "Speaker B: "
-    },
-    {
-      "self_ref": "#/texts/18",
-      "parent": {
-        "$ref": "#/groups/8"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "I was also thinking.",
-      "text": "I was also thinking.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/19",
-      "parent": {
-        "$ref": "#/groups/9"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0"
-    },
-    {
-      "self_ref": "#/texts/20",
-      "parent": {
-        "$ref": "#/groups/9"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:00:54.963 --> 00:01:02.072",
-      "text": "00:00:54.963 --> 00:01:02.072"
-    },
-    {
-      "self_ref": "#/texts/21",
-      "parent": {
-        "$ref": "#/groups/10"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker B: ",
-      "text": "Speaker B: "
-    },
-    {
-      "self_ref": "#/texts/22",
-      "parent": {
-        "$ref": "#/groups/10"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Would be maybe good to create items,",
-      "text": "Would be maybe good to create items,",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/23",
-      "parent": {
-        "$ref": "#/groups/11"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1"
-    },
-    {
-      "self_ref": "#/texts/24",
-      "parent": {
-        "$ref": "#/groups/11"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:02.072 --> 00:01:06.811",
-      "text": "00:01:02.072 --> 00:01:06.811"
-    },
-    {
-      "self_ref": "#/texts/25",
-      "parent": {
-        "$ref": "#/groups/12"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker B: ",
-      "text": "Speaker B: "
-    },
-    {
-      "self_ref": "#/texts/26",
-      "parent": {
-        "$ref": "#/groups/12"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "some metadata, some options that can be specific.",
-      "text": "some metadata, some options that can be specific.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/27",
-      "parent": {
-        "$ref": "#/groups/13"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0"
-    },
-    {
-      "self_ref": "#/texts/28",
-      "parent": {
-        "$ref": "#/groups/13"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:10.243 --> 00:01:13.014",
-      "text": "00:01:10.243 --> 00:01:13.014"
-    },
-    {
-      "self_ref": "#/texts/29",
-      "parent": {
-        "$ref": "#/groups/14"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/30",
-      "parent": {
-        "$ref": "#/groups/14"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Yeah, I mean I think you went even more than",
-      "text": "Yeah, I mean I think you went even more than",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/31",
-      "parent": {
-        "$ref": "#/groups/15"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0"
-    },
-    {
-      "self_ref": "#/texts/32",
-      "parent": {
-        "$ref": "#/groups/15"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:10.563 --> 00:01:12.643",
-      "text": "00:01:10.563 --> 00:01:12.643"
-    },
-    {
-      "self_ref": "#/texts/33",
-      "parent": {
-        "$ref": "#/groups/16"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker B: ",
-      "text": "Speaker B: "
-    },
-    {
-      "self_ref": "#/texts/34",
-      "parent": {
-        "$ref": "#/groups/16"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "But we preserved the atoms.",
-      "text": "But we preserved the atoms.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/35",
-      "parent": {
-        "$ref": "#/groups/17"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1"
-    },
-    {
-      "self_ref": "#/texts/36",
-      "parent": {
-        "$ref": "#/groups/17"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:13.014 --> 00:01:15.907",
-      "text": "00:01:13.014 --> 00:01:15.907"
-    },
-    {
-      "self_ref": "#/texts/37",
-      "parent": {
-        "$ref": "#/groups/18"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/38",
-      "parent": {
-        "$ref": "#/groups/18"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "than me. I just opened the format.",
-      "text": "than me. I just opened the format.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/39",
-      "parent": {
-        "$ref": "#/groups/19"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1"
-    },
-    {
-      "self_ref": "#/texts/40",
-      "parent": {
-        "$ref": "#/groups/19"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:50.222 --> 00:01:51.643",
-      "text": "00:01:50.222 --> 00:01:51.643"
-    },
-    {
-      "self_ref": "#/texts/41",
-      "parent": {
-        "$ref": "#/groups/20"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/42",
-      "parent": {
-        "$ref": "#/groups/20"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "give it a try, yeah.",
-      "text": "give it a try, yeah.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/43",
-      "parent": {
-        "$ref": "#/groups/21"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0"
-    },
-    {
-      "self_ref": "#/texts/44",
-      "parent": {
-        "$ref": "#/groups/21"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:52.043 --> 00:01:55.043",
-      "text": "00:01:52.043 --> 00:01:55.043"
-    },
-    {
-      "self_ref": "#/texts/45",
-      "parent": {
-        "$ref": "#/groups/22"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker B: ",
-      "text": "Speaker B: "
-    },
-    {
-      "self_ref": "#/texts/46",
-      "parent": {
-        "$ref": "#/groups/22"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Okay, talk to you later.",
-      "text": "Okay, talk to you later.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
-    },
-    {
-      "self_ref": "#/texts/47",
-      "parent": {
-        "$ref": "#/groups/23"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
-      "text": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0"
-    },
-    {
-      "self_ref": "#/texts/48",
-      "parent": {
-        "$ref": "#/groups/23"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "00:01:54.603 --> 00:01:55.283",
-      "text": "00:01:54.603 --> 00:01:55.283"
-    },
-    {
-      "self_ref": "#/texts/49",
-      "parent": {
-        "$ref": "#/groups/24"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
-      "orig": "Speaker A: ",
-      "text": "Speaker A: "
-    },
-    {
-      "self_ref": "#/texts/50",
-      "parent": {
-        "$ref": "#/groups/24"
-      },
-      "children": [],
-      "content_layer": "body",
-      "label": "text",
-      "prov": [],
+      "prov": [
+        {
+          "start_time": 114.603,
+          "end_time": 115.283,
+          "identifier": "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0",
+          "voice": "Speaker A"
+        }
+      ],
       "orig": "See you.",
-      "text": "See you.",
-      "formatting": {
-        "bold": false,
-        "italic": false,
-        "underline": false,
-        "strikethrough": false,
-        "script": "baseline"
-      }
+      "text": "See you."
     }
   ],
   "pictures": [],
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
index 859a6dde3..b58d350b3 100644
--- a/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_03.vtt.md
@@ -1,77 +1,33 @@
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0
+OK,
 
-00:00:04.963 --> 00:00:08.571
+I think now we should be recording
 
-Speaker A:  OK, I think now we should be recording
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-1
-
-00:00:08.571 --> 00:00:09.403
-
-Speaker A:  properly.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0
-
-00:00:10.683 --> 00:00:11.563
+properly.
 
 Good.
 
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/17-0
-
-00:00:13.363 --> 00:00:13.803
-
-Speaker A:  Yeah.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/78-0
-
-00:00:49.603 --> 00:00:53.363
-
-Speaker B:  I was also thinking.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-0
-
-00:00:54.963 --> 00:01:02.072
-
-Speaker B:  Would be maybe good to create items,
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/113-1
-
-00:01:02.072 --> 00:01:06.811
-
-Speaker B:  some metadata, some options that can be specific.
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-0
-
-00:01:10.243 --> 00:01:13.014
-
-Speaker A:  Yeah, I mean I think you went even more than
-
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/119-0
-
-00:01:10.563 --> 00:01:12.643
-
-Speaker B:  But we preserved the atoms.
+Yeah.
 
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/150-1
+I was also thinking.
 
-00:01:13.014 --> 00:01:15.907
+Would be maybe good to create items,
 
-Speaker A:  than me. I just opened the format.
+some metadata,
 
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/197-1
+some options that can be specific.
 
-00:01:50.222 --> 00:01:51.643
+Yeah,
 
-Speaker A:  give it a try, yeah.
+I mean I think you went even more than
 
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/200-0
+But we preserved the atoms.
 
-00:01:52.043 --> 00:01:55.043
+than me.
 
-Speaker B:  Okay, talk to you later.
+I just opened the format.
 
-62357a1d-d250-41d5-a1cf-6cc0eeceffcc/202-0
+give it a try, yeah.
 
-00:01:54.603 --> 00:01:55.283
+Okay, talk to you later.
 
-Speaker A:  See you.
\ No newline at end of file
+See you.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt
new file mode 100644
index 000000000..93feba5e9
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.itxt
@@ -0,0 +1,14 @@
+item-0 at level 0: unspecified: group _root_
+  item-1 at level 1: text: Last night the chef surprised us with a culinary adventure.
+  item-2 at level 1: inline: group WebVTT cue span
+    item-3 at level 2: text: The waiter offered a 
+    item-4 at level 2: text: steaming bowl of 
+    item-5 at level 2: text: paella
+    item-6 at level 2: text:  that instantly transported the diners to a sunny Mediterranean coast.
+  item-7 at level 1: inline: group WebVTT cue span
+    item-8 at level 2: text: The dessert’s 
+    item-9 at level 2: text: unexpected
+    item-10 at level 2: text:  
+    item-11 at level 2: text: arcobaleno
+    item-12 at level 2: text:  of flavors
+    item-13 at level 2: text:  left everyone in awe.
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json
new file mode 100644
index 000000000..17ab9f501
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.json
@@ -0,0 +1,344 @@
+{
+  "schema_name": "DoclingDocument",
+  "version": "1.8.0",
+  "name": "webvtt_example_04",
+  "origin": {
+    "mimetype": "text/vtt",
+    "binary_hash": 5389775195091554844,
+    "filename": "webvtt_example_04.vtt"
+  },
+  "furniture": {
+    "self_ref": "#/furniture",
+    "children": [],
+    "content_layer": "furniture",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "body": {
+    "self_ref": "#/body",
+    "children": [
+      {
+        "$ref": "#/texts/0"
+      },
+      {
+        "$ref": "#/groups/0"
+      },
+      {
+        "$ref": "#/groups/1"
+      }
+    ],
+    "content_layer": "body",
+    "name": "_root_",
+    "label": "unspecified"
+  },
+  "groups": [
+    {
+      "self_ref": "#/groups/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/1"
+        },
+        {
+          "$ref": "#/texts/2"
+        },
+        {
+          "$ref": "#/texts/3"
+        },
+        {
+          "$ref": "#/texts/4"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    },
+    {
+      "self_ref": "#/groups/1",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [
+        {
+          "$ref": "#/texts/5"
+        },
+        {
+          "$ref": "#/texts/6"
+        },
+        {
+          "$ref": "#/texts/7"
+        },
+        {
+          "$ref": "#/texts/8"
+        },
+        {
+          "$ref": "#/texts/9"
+        },
+        {
+          "$ref": "#/texts/10"
+        }
+      ],
+      "content_layer": "body",
+      "name": "WebVTT cue span",
+      "label": "inline"
+    }
+  ],
+  "texts": [
+    {
+      "self_ref": "#/texts/0",
+      "parent": {
+        "$ref": "#/body"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14580.0,
+          "end_time": 14760.0,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "Last night the chef surprised us with a culinary adventure.",
+      "text": "Last night the chef surprised us with a culinary adventure."
+    },
+    {
+      "self_ref": "#/texts/1",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "The waiter offered a ",
+      "text": "The waiter offered a "
+    },
+    {
+      "self_ref": "#/texts/2",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "steaming bowl of ",
+      "text": "steaming bowl of ",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/3",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "languages": [
+            "es-ES"
+          ]
+        }
+      ],
+      "orig": "paella",
+      "text": "paella",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/4",
+      "parent": {
+        "$ref": "#/groups/0"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " that instantly transported the diners to a sunny Mediterranean coast.",
+      "text": " that instantly transported the diners to a sunny Mediterranean coast."
+    },
+    {
+      "self_ref": "#/texts/5",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": "The dessert’s ",
+      "text": "The dessert’s "
+    },
+    {
+      "self_ref": "#/texts/6",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "classes": [
+            "b.loud"
+          ]
+        }
+      ],
+      "orig": "unexpected",
+      "text": "unexpected",
+      "formatting": {
+        "bold": true,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/7",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " ",
+      "text": " ",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/8",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234",
+          "languages": [
+            "it"
+          ]
+        }
+      ],
+      "orig": "arcobaleno",
+      "text": "arcobaleno",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": true,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/9",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " of flavors",
+      "text": " of flavors",
+      "formatting": {
+        "bold": false,
+        "italic": true,
+        "underline": false,
+        "strikethrough": false,
+        "script": "baseline"
+      }
+    },
+    {
+      "self_ref": "#/texts/10",
+      "parent": {
+        "$ref": "#/groups/1"
+      },
+      "children": [],
+      "content_layer": "body",
+      "label": "text",
+      "prov": [
+        {
+          "start_time": 14760.0,
+          "end_time": 14818.239,
+          "identifier": "agcvs-08234"
+        }
+      ],
+      "orig": " left everyone in awe.",
+      "text": " left everyone in awe."
+    }
+  ],
+  "pictures": [],
+  "tables": [],
+  "key_value_items": [],
+  "form_items": [],
+  "pages": {}
+}
\ No newline at end of file
diff --git a/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md
new file mode 100644
index 000000000..f2312a059
--- /dev/null
+++ b/tests/data/groundtruth/docling_v2/webvtt_example_04.vtt.md
@@ -0,0 +1,5 @@
+Last night the chef surprised us with a culinary adventure.
+
+The waiter offered a  *steaming bowl of * *paella*  that instantly transported the diners to a sunny Mediterranean coast.
+
+The dessert’s  ***unexpected*** * * *arcobaleno* * of flavors*  left everyone in awe.
\ No newline at end of file
diff --git a/tests/data/webvtt/webvtt_example_02.vtt b/tests/data/webvtt/webvtt_example_02.vtt
index 1152a1e8f..6bd182101 100644
--- a/tests/data/webvtt/webvtt_example_02.vtt
+++ b/tests/data/webvtt/webvtt_example_02.vtt
@@ -12,4 +12,7 @@ NOTE Copyright © 2019 World Wide Web Consortium. https://www.w3.org/TR/webvtt1/
 <v Esme>Hee!</v> <i>laughter</i>
 
 00:06.000 --> 00:08.000
-<v.loud Mary>That’s awesome!
\ No newline at end of file
+<v.loud Mary>That’s awesome!
+
+00:08.000 --> 00:10.000
+Sur les <i.foreignphrase><lang en>playground</lang></i>, ici à Montpellier
\ No newline at end of file
diff --git a/tests/data/webvtt/webvtt_example_04.vtt b/tests/data/webvtt/webvtt_example_04.vtt
new file mode 100644
index 000000000..fd7b788c0
--- /dev/null
+++ b/tests/data/webvtt/webvtt_example_04.vtt
@@ -0,0 +1,10 @@
+WEBVTT
+
+agcvs-08234
+04:03:00.000 --> 04:06:00.000
+Last night the chef surprised us with a culinary adventure.
+
+agcvs-08234
+04:06:00.000 --> 04:06:58.239
+The waiter offered a <i>steaming bowl of <lang es-ES>paella</lang></i> that instantly transported the diners to a sunny Mediterranean coast.
+The dessert’s <i><b.loud>unexpected</b> <u><lang it>arcobaleno</lang></u> of flavors</i> left everyone in awe.
\ No newline at end of file
diff --git a/tests/test_backend_vtt.py b/tests/test_backend_vtt.py
index a910671bb..54e91219d 100644
--- a/tests/test_backend_vtt.py
+++ b/tests/test_backend_vtt.py
@@ -1,19 +1,7 @@
-# Assisted by watsonx Code Assistant
-
 from pathlib import Path
 
-import pytest
 from docling_core.types.doc import DoclingDocument
-from pydantic import ValidationError
 
-from docling.backend.webvtt_backend import (
-    _WebVTTCueItalicSpan,
-    _WebVTTCueTextSpan,
-    _WebVTTCueTimings,
-    _WebVTTCueVoiceSpan,
-    _WebVTTFile,
-    _WebVTTTimestamp,
-)
 from docling.datamodel.base_models import InputFormat
 from docling.datamodel.document import ConversionResult
 from docling.document_converter import DocumentConverter
@@ -24,187 +12,6 @@
 GENERATE = GEN_TEST_DATA
 
 
-def test_vtt_cue_commponents():
-    """Test WebVTT components."""
-    valid_timestamps = [
-        "00:01:02.345",
-        "12:34:56.789",
-        "02:34.567",
-        "00:00:00.000",
-    ]
-    valid_total_seconds = [
-        1 * 60 + 2.345,
-        12 * 3600 + 34 * 60 + 56.789,
-        2 * 60 + 34.567,
-        0.0,
-    ]
-    for idx, ts in enumerate(valid_timestamps):
-        model = _WebVTTTimestamp(raw=ts)
-        assert model.seconds == valid_total_seconds[idx]
-
-    """Test invalid WebVTT timestamps."""
-    invalid_timestamps = [
-        "00:60:02.345",  # minutes > 59
-        "00:01:60.345",  # seconds > 59
-        "00:01:02.1000",  # milliseconds > 999
-        "01:02:03",  # missing milliseconds
-        "01:02",  # missing milliseconds
-        ":01:02.345",  # extra : for missing hours
-        "abc:01:02.345",  # invalid format
-    ]
-    for ts in invalid_timestamps:
-        with pytest.raises(ValidationError):
-            _WebVTTTimestamp(raw=ts)
-
-    """Test the timestamp __str__ method."""
-    model = _WebVTTTimestamp(raw="00:01:02.345")
-    assert str(model) == "00:01:02.345"
-
-    """Test valid cue timings."""
-    start = _WebVTTTimestamp(raw="00:10.005")
-    end = _WebVTTTimestamp(raw="00:14.007")
-    cue_timings = _WebVTTCueTimings(start=start, end=end)
-    assert cue_timings.start == start
-    assert cue_timings.end == end
-    assert str(cue_timings) == "00:10.005 --> 00:14.007"
-
-    """Test invalid cue timings with end timestamp before start."""
-    start = _WebVTTTimestamp(raw="00:10.700")
-    end = _WebVTTTimestamp(raw="00:10.500")
-    with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(start=start, end=end)
-    assert "End timestamp must be greater than start timestamp" in str(excinfo.value)
-
-    """Test invalid cue timings with missing end."""
-    start = _WebVTTTimestamp(raw="00:10.500")
-    with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(start=start)
-    assert "Field required" in str(excinfo.value)
-
-    """Test invalid cue timings with missing start."""
-    end = _WebVTTTimestamp(raw="00:10.500")
-    with pytest.raises(ValidationError) as excinfo:
-        _WebVTTCueTimings(end=end)
-    assert "Field required" in str(excinfo.value)
-
-    """Test with valid text."""
-    valid_text = "This is a valid cue text span."
-    span = _WebVTTCueTextSpan(text=valid_text)
-    assert span.text == valid_text
-    assert str(span) == valid_text
-
-    """Test with text containing newline characters."""
-    invalid_text = "This cue text span\ncontains a newline."
-    with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
-
-    """Test with text containing ampersand."""
-    invalid_text = "This cue text span contains &."
-    with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
-
-    """Test with text containing less-than sign."""
-    invalid_text = "This cue text span contains <."
-    with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text=invalid_text)
-
-    """Test with empty text."""
-    with pytest.raises(ValidationError):
-        _WebVTTCueTextSpan(text="")
-
-    """Test that annotation validation works correctly."""
-    valid_annotation = "valid-annotation"
-    invalid_annotation = "invalid\nannotation"
-    with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=invalid_annotation)
-    assert _WebVTTCueVoiceSpan(annotation=valid_annotation)
-
-    """Test that classes validation works correctly."""
-    annotation = "speaker name"
-    valid_classes = ["class1", "class2"]
-    invalid_classes = ["class\nwith\nnewlines", ""]
-    with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=annotation, classes=invalid_classes)
-    assert _WebVTTCueVoiceSpan(annotation=annotation, classes=valid_classes)
-
-    """Test that components validation works correctly."""
-    annotation = "speaker name"
-    valid_components = [_WebVTTCueTextSpan(text="random text")]
-    invalid_components = [123, "not a component"]
-    with pytest.raises(ValidationError):
-        _WebVTTCueVoiceSpan(annotation=annotation, components=invalid_components)
-    assert _WebVTTCueVoiceSpan(annotation=annotation, components=valid_components)
-
-    """Test valid cue voice spans."""
-    cue_span = _WebVTTCueVoiceSpan(
-        annotation="speaker",
-        classes=["loud", "clear"],
-        components=[_WebVTTCueTextSpan(text="random text")],
-    )
-
-    expected_str = "<v.loud.clear speaker>random text</v>"
-    assert str(cue_span) == expected_str
-
-    cue_span = _WebVTTCueVoiceSpan(
-        annotation="speaker",
-        components=[_WebVTTCueTextSpan(text="random text")],
-    )
-    expected_str = "<v speaker>random text</v>"
-    assert str(cue_span) == expected_str
-
-
-def test_webvtt_file():
-    """Test WebVTT files."""
-    with open("./tests/data/webvtt/webvtt_example_01.vtt", encoding="utf-8") as f:
-        content = f.read()
-        vtt = _WebVTTFile.parse(content)
-    assert len(vtt) == 13
-    block = vtt.cue_blocks[11]
-    assert str(block.timings) == "00:32.500 --> 00:33.500"
-    assert len(block.payload) == 1
-    cue_span = block.payload[0]
-    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
-    assert cue_span.annotation == "Neil deGrasse Tyson"
-    assert not cue_span.classes
-    assert len(cue_span.components) == 1
-    comp = cue_span.components[0]
-    assert isinstance(comp, _WebVTTCueItalicSpan)
-    assert len(comp.components) == 1
-    comp2 = comp.components[0]
-    assert isinstance(comp2, _WebVTTCueTextSpan)
-    assert comp2.text == "Laughs"
-
-    with open("./tests/data/webvtt/webvtt_example_02.vtt", encoding="utf-8") as f:
-        content = f.read()
-        vtt = _WebVTTFile.parse(content)
-    assert len(vtt) == 4
-    reverse = (
-        "WEBVTT\n\nNOTE Copyright © 2019 World Wide Web Consortium. "
-        "https://www.w3.org/TR/webvtt1/\n\n"
-    )
-    reverse += "\n\n".join([str(block) for block in vtt.cue_blocks])
-    assert content == reverse
-
-    with open("./tests/data/webvtt/webvtt_example_03.vtt", encoding="utf-8") as f:
-        content = f.read()
-        vtt = _WebVTTFile.parse(content)
-    assert len(vtt) == 13
-    for block in vtt:
-        assert block.identifier
-    block = vtt.cue_blocks[0]
-    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/15-0"
-    assert str(block.timings) == "00:00:04.963 --> 00:00:08.571"
-    assert len(block.payload) == 1
-    assert isinstance(block.payload[0], _WebVTTCueVoiceSpan)
-    block = vtt.cue_blocks[2]
-    assert isinstance(cue_span, _WebVTTCueVoiceSpan)
-    assert block.identifier == "62357a1d-d250-41d5-a1cf-6cc0eeceffcc/16-0"
-    assert str(block.timings) == "00:00:10.683 --> 00:00:11.563"
-    assert len(block.payload) == 1
-    assert isinstance(block.payload[0], _WebVTTCueTextSpan)
-    assert block.payload[0].text == "Good."
-
-
 def test_e2e_vtt_conversions():
     directory = Path("./tests/data/webvtt/")
     vtt_paths = sorted(directory.rglob("*.vtt"))
diff --git a/tests/verify_utils.py b/tests/verify_utils.py
index 93f33e1fd..ad7eafa98 100644
--- a/tests/verify_utils.py
+++ b/tests/verify_utils.py
@@ -10,6 +10,8 @@
     DoclingDocument,
     FormulaItem,
     PictureItem,
+    ProvenanceItem,
+    ProvenanceTrack,
     TableItem,
     TextItem,
 )
@@ -237,7 +239,30 @@ def verify_docitems(doc_pred: DoclingDocument, doc_true: DoclingDocument, fuzzy:
             true_prov = true_item.prov[0]
             pred_prov = pred_item.prov[0]
 
-            assert true_prov.page_no == pred_prov.page_no, "Page provenance mistmatch"
+            assert type(pred_prov) is type(true_prov), "Provenance type mismatch"
+            if isinstance(pred_prov, ProvenanceItem):
+                assert true_prov.page_no == pred_prov.page_no, (
+                    "Page provenance mistmatch"
+                )
+            elif isinstance(pred_prov, ProvenanceTrack):
+                assert true_prov.start_time._seconds == pred_prov.start_time._seconds, (
+                    "ProvenanceTrack start time mismatch"
+                )
+                assert true_prov.end_time._seconds == pred_prov.end_time._seconds, (
+                    "ProvenanceTrack end time mismatch"
+                )
+                assert true_prov.languages == pred_prov.languages, (
+                    "ProvenanceTrack languages mismatch"
+                )
+                assert true_prov.classes == pred_prov.classes, (
+                    "ProvenanceTrack classes mismatch"
+                )
+                assert true_prov.identifier == pred_prov.identifier, (
+                    "ProvenanceTrack identifier mismatch"
+                )
+                assert true_prov.voice == pred_prov.voice, (
+                    "ProvenanceTrack voice mismatch"
+                )
 
             # TODO: add bbox check with tolerance
 
diff --git a/uv.lock b/uv.lock
index 6548b79f4..cd0663a3d 100644
--- a/uv.lock
+++ b/uv.lock
@@ -1546,7 +1546,7 @@ requires-dist = [
     { name = "accelerate", marker = "extra == 'vlm'", specifier = ">=1.2.1,<2.0.0" },
     { name = "beautifulsoup4", specifier = ">=4.12.3,<5.0.0" },
     { name = "certifi", specifier = ">=2024.7.4" },
-    { name = "docling-core", extras = ["chunking"], specifier = ">=2.50.1,<3.0.0" },
+    { name = "docling-core", extras = ["chunking"], git = "ssh://git@github.com/docling-project/docling-core.git?rev=c75516516358f25add2682674fc7dc6eef2c5164" },
     { name = "docling-ibm-models", specifier = ">=3.9.1,<4" },
     { name = "docling-parse", specifier = ">=4.7.0,<5.0.0" },
     { name = "easyocr", marker = "extra == 'easyocr'", specifier = ">=1.7,<2.0" },
@@ -1631,8 +1631,8 @@ examples = [
 
 [[package]]
 name = "docling-core"
-version = "2.51.1"
-source = { registry = "https://pypi.org/simple" }
+version = "2.55.0"
+source = { git = "ssh://git@github.com/docling-project/docling-core.git?rev=c75516516358f25add2682674fc7dc6eef2c5164#c75516516358f25add2682674fc7dc6eef2c5164" }
 dependencies = [
     { name = "jsonref" },
     { name = "jsonschema" },
@@ -1645,10 +1645,6 @@ dependencies = [
     { name = "typer" },
     { name = "typing-extensions" },
 ]
-sdist = { url = "https://files.pythonhosted.org/packages/33/76/665a61f6208923fb312549d9c7a2ef5275bdd7fd4d83cbe8ddd668f2fa35/docling_core-2.51.1.tar.gz", hash = "sha256:f5b0d8ead535c8451f67f9545af007f5bebfda72744a8e90af6e83fb6a483a99", size = 184664, upload-time = "2025-11-14T13:33:48.586Z" }
-wheels = [
-    { url = "https://files.pythonhosted.org/packages/a6/06/911a7374d59afff0dd8b50f84e1b7e5c4452886bbbe0e31e04510f44d43e/docling_core-2.51.1-py3-none-any.whl", hash = "sha256:76ca2b4c5c1d33475583671fe584b390e769152cac48d1fb24bf5a7457864a66", size = 186005, upload-time = "2025-11-14T13:33:46.695Z" },
-]
 
 [package.optional-dependencies]
 chunking = [
@@ -6119,6 +6115,9 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/20/8a/b35a615ae6f04550d696bb179c414538b3b477999435fdd4ad75b76139e4/pybase64-1.4.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = "sha256:a370dea7b1cee2a36a4d5445d4e09cc243816c5bc8def61f602db5a6f5438e52", size = 54320, upload-time = "2025-07-27T13:03:27.495Z" },
     { url = "https://files.pythonhosted.org/packages/d3/a9/8bd4f9bcc53689f1b457ecefed1eaa080e4949d65a62c31a38b7253d5226/pybase64-1.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:9aa4de83f02e462a6f4e066811c71d6af31b52d7484de635582d0e3ec3d6cc3e", size = 56482, upload-time = "2025-07-27T13:03:28.942Z" },
     { url = "https://files.pythonhosted.org/packages/75/e5/4a7735b54a1191f61c3f5c2952212c85c2d6b06eb5fb3671c7603395f70c/pybase64-1.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:83a1c2f9ed00fee8f064d548c8654a480741131f280e5750bb32475b7ec8ee38", size = 70959, upload-time = "2025-07-27T13:03:30.171Z" },
+    { url = "https://files.pythonhosted.org/packages/f4/56/5337f27a8b8d2d6693f46f7b36bae47895e5820bfa259b0072574a4e1057/pybase64-1.4.2-cp313-cp313-android_21_arm64_v8a.whl", hash = "sha256:0f331aa59549de21f690b6ccc79360ffed1155c3cfbc852eb5c097c0b8565a2b", size = 33888, upload-time = "2025-07-27T13:03:35.698Z" },
+    { url = "https://files.pythonhosted.org/packages/e3/ff/470768f0fe6de0aa302a8cb1bdf2f9f5cffc3f69e60466153be68bc953aa/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphoneos.whl", hash = "sha256:69d3f0445b0faeef7bb7f93bf8c18d850785e2a77f12835f49e524cc54af04e7", size = 30914, upload-time = "2025-07-27T13:03:38.475Z" },
+    { url = "https://files.pythonhosted.org/packages/75/6b/d328736662665e0892409dc410353ebef175b1be5eb6bab1dad579efa6df/pybase64-1.4.2-cp313-cp313-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:2372b257b1f4dd512f317fb27e77d313afd137334de64c87de8374027aacd88a", size = 31380, upload-time = "2025-07-27T13:03:39.7Z" },
     { url = "https://files.pythonhosted.org/packages/ca/96/7ff718f87c67f4147c181b73d0928897cefa17dc75d7abc6e37730d5908f/pybase64-1.4.2-cp313-cp313-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:fb794502b4b1ec91c4ca5d283ae71aef65e3de7721057bd9e2b3ec79f7a62d7d", size = 38230, upload-time = "2025-07-27T13:03:41.637Z" },
     { url = "https://files.pythonhosted.org/packages/71/ab/db4dbdfccb9ca874d6ce34a0784761471885d96730de85cee3d300381529/pybase64-1.4.2-cp313-cp313-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:d377d48acf53abf4b926c2a7a24a19deb092f366a04ffd856bf4b3aa330b025d", size = 71608, upload-time = "2025-07-27T13:03:47.01Z" },
     { url = "https://files.pythonhosted.org/packages/f2/58/7f2cef1ceccc682088958448d56727369de83fa6b29148478f4d2acd107a/pybase64-1.4.2-cp313-cp313-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:ab9cdb6a8176a5cb967f53e6ad60e40c83caaa1ae31c5e1b29e5c8f507f17538", size = 56413, upload-time = "2025-07-27T13:03:49.908Z" },
@@ -6140,6 +6139,8 @@ wheels = [
     { url = "https://files.pythonhosted.org/packages/95/f0/c392c4ac8ccb7a34b28377c21faa2395313e3c676d76c382642e19a20703/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:ad59362fc267bf15498a318c9e076686e4beeb0dfe09b457fabbc2b32468b97a", size = 58103, upload-time = "2025-07-27T13:04:29.996Z" },
     { url = "https://files.pythonhosted.org/packages/32/30/00ab21316e7df8f526aa3e3dc06f74de6711d51c65b020575d0105a025b2/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_s390x.whl", hash = "sha256:01593bd064e7dcd6c86d04e94e44acfe364049500c20ac68ca1e708fbb2ca970", size = 60779, upload-time = "2025-07-27T13:04:31.549Z" },
     { url = "https://files.pythonhosted.org/packages/a6/65/114ca81839b1805ce4a2b7d58bc16e95634734a2059991f6382fc71caf3e/pybase64-1.4.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:5b81547ad8ea271c79fdf10da89a1e9313cb15edcba2a17adf8871735e9c02a0", size = 74684, upload-time = "2025-07-27T13:04:32.976Z" },
+    { url = "https://files.pythonhosted.org/packages/99/bf/00a87d951473ce96c8c08af22b6983e681bfabdb78dd2dcf7ee58eac0932/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphoneos.whl", hash = "sha256:4157ad277a32cf4f02a975dffc62a3c67d73dfa4609b2c1978ef47e722b18b8e", size = 30924, upload-time = "2025-07-27T13:04:39.189Z" },
+    { url = "https://files.pythonhosted.org/packages/ae/43/dee58c9d60e60e6fb32dc6da722d84592e22f13c277297eb4ce6baf99a99/pybase64-1.4.2-cp314-cp314-ios_13_0_arm64_iphonesimulator.whl", hash = "sha256:e113267dc349cf624eb4f4fbf53fd77835e1aa048ac6877399af426aab435757", size = 31390, upload-time = "2025-07-27T13:04:40.995Z" },
     { url = "https://files.pythonhosted.org/packages/e1/11/b28906fc2e330b8b1ab4bc845a7bef808b8506734e90ed79c6062b095112/pybase64-1.4.2-cp314-cp314-ios_13_0_x86_64_iphonesimulator.whl", hash = "sha256:cea5aaf218fd9c5c23afacfe86fd4464dfedc1a0316dd3b5b4075b068cc67df0", size = 38212, upload-time = "2025-07-27T13:04:42.729Z" },
     { url = "https://files.pythonhosted.org/packages/e4/2e/851eb51284b97354ee5dfa1309624ab90920696e91a33cd85b13d20cc5c1/pybase64-1.4.2-cp314-cp314-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:a3e54dcf0d0305ec88473c9d0009f698cabf86f88a8a10090efeff2879c421bb", size = 71674, upload-time = "2025-07-27T13:04:49.294Z" },
     { url = "https://files.pythonhosted.org/packages/a4/8e/3479266bc0e65f6cc48b3938d4a83bff045330649869d950a378f2ddece0/pybase64-1.4.2-cp314-cp314-manylinux2014_armv7l.manylinux_2_17_armv7l.whl", hash = "sha256:753da25d4fd20be7bda2746f545935773beea12d5cb5ec56ec2d2960796477b1", size = 56461, upload-time = "2025-07-27T13:04:52.37Z" },