Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 2 additions & 4 deletions .github/workflows/dts-validator.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ name: DTS-Validator

on:
push:
branches:
- main
pull_request:

jobs:
Expand Down Expand Up @@ -39,10 +37,10 @@ jobs:
run: sleep 5 # Adjust as needed

- name: Install and run tests
run: |
run: |
cd dts-validator
python -m venv env
source env/bin/activate
pip install poetry
poetry install
pytest --entry-endpoint=http://localhost:5000
pytest --entry-endpoint=http://localhost:5000
2 changes: 1 addition & 1 deletion dapytains/processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
try:
if saxon_version == "PE":
import saxoncpe as saxonlib
elif saxon_version == "PE":
elif saxon_version == "EE":
import saxoncee as saxonlib
else:
import saxonche as saxonlib
Expand Down
26 changes: 15 additions & 11 deletions dapytains/tei/citeStructure.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@
from typing import Dict, List, Optional
from dataclasses import dataclass, field
from collections import namedtuple, defaultdict
from functools import cmp_to_key
from dapytains.processor import get_xpath_proc, saxonlib

_pos_re = re.compile(r'\[(\d+)\]')


@dataclass
class CiteData:
Expand Down Expand Up @@ -281,15 +282,6 @@ def find_refs_from_branches(
units = []
xpath_prefix = "./" if unit else ""

# Custom comparison function to compare nodes by document order
def compare_nodes_by_doc_order(node1, node2):
    """Pairwise comparator ordering two nodes by document order.

    Evaluates the XPath node-comparison ``<<`` between the two nodes'
    ``xpath`` expressions through the enclosing ``xpath_proc``.

    :param node1: Object exposing an ``xpath`` attribute
    :param node2: Object exposing an ``xpath`` attribute
    :return: -1 when the evaluation's string value is "true", 1 in every
        other case (0 is never returned)
    """
    verdict = xpath_proc.evaluate_single(f'{node1.xpath} << {node2.xpath}').string_value
    return -1 if verdict == "true" else 1

unsorted = []
for s in structure:
unsorted.extend(
Expand All @@ -303,7 +295,19 @@ def compare_nodes_by_doc_order(node1, node2):
_simple_node(ref, self.generate_xpath(ref), struct)
for ref, struct in unsorted
]
unsorted = sorted(unsorted, key=cmp_to_key(compare_nodes_by_doc_order))
# Generate a positional path key for each node once (O(n) Saxon calls) and sort
# natively in Python, rather than calling back into Saxon for every pairwise
# comparison (which would cost O(n log n) Saxon round-trips).
def _doc_order_key(node):
    """Build a sort key for ``node`` from its positional path.

    A single XPath evaluation renders the ancestor-or-self chain of the node
    as ``/name[position]`` steps; the bracketed positions are then extracted
    with ``_pos_re`` and returned as a tuple of ints.

    :param node: Object exposing an ``xpath`` attribute
    :return: Tuple of 1-based sibling positions, outermost element first
    """
    # Each position counts every preceding element sibling, whatever its
    # name, so differently named siblings at one level still compare
    # correctly against each other.
    steps_query = (
        f"string-join(for $n in ({node.xpath}/ancestor-or-self::*) "
        f"return concat('/', name($n), '[', 1 + count($n/preceding-sibling::*), ']'), '')"
    )
    rendered = str(xpath_proc.evaluate_single(steps_query))
    return tuple(map(int, _pos_re.findall(rendered)))

unsorted = sorted(unsorted, key=_doc_order_key)

units = []
for elem in unsorted:
Expand Down
26 changes: 12 additions & 14 deletions dapytains/tei/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,21 +37,19 @@ def xpath_walk(xpath: List[str]) -> Tuple[str, List[str], List[str]]:
return current_filled, queue, [xpath[0]] if len(xpath) > 1 else []


def is_traversing_xpath(parent: saxonlib.PyXdmNode, xpath: str, processor: saxonlib.PySaxonProcessor) -> bool:
def is_traversing_xpath(xpath_proc: saxonlib.PyXPathProcessor, xpath: str) -> bool:
""" Check if an XPath is traversing more than one level

:param parent:
:param xpath_proc: XPath processor with context already set to the parent node
:param xpath:
:return:
"""
xpath_proc = get_xpath_proc(parent, processor=processor)
if xpath.startswith(".//"):
# If the XPath starts with .//, we try to see if we have a direct child that matches
drct_xpath = xpath.replace(".//", "./", 1)
if xpath_proc.effective_boolean_value(f"head({xpath}) is head({drct_xpath})"):
return False
else:
return True
return True
return False


Expand All @@ -70,7 +68,7 @@ def xpath_walk_step(parent: saxonlib.PyXdmNode, xpath: str, processor: saxonlib.
xpath_proc = get_xpath_proc(parent, processor=processor)
# We check first for loops, because that changes the xpath
if xpath.startswith(".//"):
if is_traversing_xpath(parent, xpath, processor=processor):
if is_traversing_xpath(xpath_proc, xpath):
return xpath_proc.evaluate_single(f"./*[{xpath}]"), True
else:
return xpath_proc.evaluate_single(xpath), False
Expand Down Expand Up @@ -174,11 +172,12 @@ def copy_node(
_add_space_tail(element, node, processor=processor)
return element
elif parent is not None:
if not parent.getchildren():
existing = parent.getchildren()
if not existing:
if not isinstance(parent, (StringElement, ObjectifiedElement)):
parent.text = unescape((parent.text or "") + element)
else:
parent.getchildren()[-1].tail = unescape(element)
existing[-1].tail = unescape(element)
return parent

if node is None:
Expand Down Expand Up @@ -373,7 +372,7 @@ def reconstruct_doc(

# Given that both XPath returns the same node, we still need to check if end is looping
# We optimize by avoiding this check when start and end are the same
if start_xpath != end_xpath and is_traversing_xpath(root, current_end, processor=processor):
if start_xpath != end_xpath and is_traversing_xpath(xpath_proc, current_end):
queue_end = end_xpath

# If we have a child XPath, then continue the job
Expand Down Expand Up @@ -407,7 +406,7 @@ def reconstruct_doc(

# Given that both XPath returns the same node, we still need to check if end is looping
# We optimize by avoiding this check when start and end are the same
if start_xpath != end_xpath and is_traversing_xpath(root, current_end, processor=processor):
if start_xpath != end_xpath and is_traversing_xpath(xpath_proc, current_end):
queue_end = end_xpath

reconstruct_doc(
Expand All @@ -434,7 +433,7 @@ def reconstruct_doc(

# Given that both XPath returns the same node, we still need to check if end is looping
# We optimize by avoiding this check when start and end are the same
if start_xpath != end_xpath and is_traversing_xpath(root, current_end, processor=processor):
if start_xpath != end_xpath and is_traversing_xpath(xpath_proc, current_end):
queue_end = end_xpath

new_tree = reconstruct_doc(
Expand Down Expand Up @@ -486,8 +485,7 @@ def reconstruct_doc(
sib_current_end = clean_xpath_for_following(current_end, end_is_traversing)

# We look for siblings between start and end matches
xpath = get_xpath_proc(root, processor=processor)
for sibling in xpath_eval(xpath, f"./node()[preceding-sibling::{sib_current_start} and following-sibling::{sib_current_end}]"):
for sibling in xpath_eval(xpath_proc, f"./node()[preceding-sibling::{sib_current_start} and following-sibling::{sib_current_end}]"):
copy_node(sibling, include_children=True, parent=new_tree, processor=processor)

# Here we reached the end, logically.
Expand Down Expand Up @@ -622,7 +620,7 @@ def get_passage(self, ref_or_start: Optional[str], end: Optional[str] = None, tr
return root

def get_reffs(self, tree: Optional[str] = None):
    """Return the references found in the document for one citation tree.

    :param tree: Key of the wanted entry in ``self.citeStructure``; a falsy
        value selects ``self.default_tree``
    :return: The result of the selected entry's ``find_refs`` called with
        ``self.xml`` as root and the entry's own ``structure``
    """
    # Resolve the entry exactly once and keep it under its own name:
    # rebinding ``tree`` would make any further lookup use the resolved
    # object, not the tree name, as the key.
    parser = self.citeStructure[tree or self.default_tree]
    return parser.find_refs(root=self.xml, structure=parser.structure)

def get_next(self, tree, unit) -> Optional[CitableUnit]:
Expand Down
Loading
Loading