Skip to content

Commit

Permalink
Partial completion of attribute strings should be fully normalized wh…
Browse files Browse the repository at this point in the history
…en set #48
  • Loading branch information
servilla committed Feb 17, 2024
1 parent d782c01 commit 99579d9
Show file tree
Hide file tree
Showing 2 changed files with 109 additions and 0 deletions.
68 changes: 68 additions & 0 deletions src/metapype/model/normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
:Mod:
normalize
:Synopsis:
Normalize an EML XML document instance or text content, including replacement of non-breaking space characters
with space characters.
:Author:
servilla
:Created:
2/17/24
"""

import daiquiri
from lxml import etree

logger = daiquiri.getLogger(__name__)

normalize_whitespace = \
"""<xsl:stylesheet version="1.0"
xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
>
<xsl:output omit-xml-declaration="no" indent="yes"/>
<!-- Template to copy nodes and apply templates to attributes and child nodes -->
<xsl:template match="@*|node()">
<xsl:copy>
<!-- Apply templates to attributes first to normalize space, then to child nodes -->
<xsl:apply-templates select="@*"/>
<xsl:apply-templates select="node()"/>
</xsl:copy>
</xsl:template>
<!-- Template for normalizing space in text nodes, with specific exclusions -->
<xsl:template match="text()[not(ancestor::markup or ancestor::literalLayout or ancestor::objectName or ancestor::attributeName or ancestor::para)]">
<xsl:value-of select="normalize-space(.)"/>
</xsl:template>
<!-- Template to normalize space in attribute values -->
<xsl:template match="@*">
<!-- Create a new attribute with the same name but normalized value -->
<xsl:attribute name="{name()}">
<xsl:value-of select="normalize-space(.)"/>
</xsl:attribute>
</xsl:template>
</xsl:stylesheet>"""


def normalize(content: str, is_xml: bool = False) -> str:
"""
Normalize whitespace within a string, including replacement of non-breaking space characters
:param content: String content to be normalized
:param is_xml: Boolean to indicate if content is XML
:return: Normalized content as a unicode string
"""
if is_xml:
xslt = etree.XSLT(etree.XML(normalize_whitespace))
normalized = str(xslt(etree.XML(content.replace('\xa0', ' ').encode("utf-8"))))
else:
words = content.replace('\xa0', ' ').split(" ")
normalized = " ".join([word.strip() for word in words if word.strip() != ""])
return normalized
41 changes: 41 additions & 0 deletions tests/test_normalize.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
:Mod:
test_normalize
:Synopsis:
:Author:
servilla
:Created:
2/17/24
"""
import daiquiri

from metapype.model.normalize import normalize

logger = daiquiri.getLogger(__name__)


def test_normalize_xml():
test_xml = "<?xml version=\"1.0\"?><test a=\" test me \"><child> This is a test </child></test>"

expected = ('<?xml version="1.0"?>\n'
'<test a="test me">\n'
' <child>This is a test</child>\n'
'</test>\n')

normalized = normalize(test_xml, is_xml=True)
assert normalized == expected


def test_normalize_text():
test_text = " This is a test "

expected = "This is a test"

normalized = normalize(test_text, is_xml=False)
assert normalized == expected

0 comments on commit 99579d9

Please sign in to comment.