@@ -1109,10 +1109,8 @@ class LxmlParser:
1109
1109
1110
1110
expression : str
1111
1111
method : str
1112
- exclude : Optional [str ]
1113
1112
namespaces : Optional [Dict [str , str ]]
1114
1113
skip : int
1115
- maxitems : int
1116
1114
1117
1115
def __init__ (
1118
1116
self : Union ['LxmlParser' , 'CSSFilter' , 'XPathFilter' ],
@@ -1143,8 +1141,7 @@ def __init__(
1143
1141
f"The '{ filter_kind } ' filter's namespace prefixes are only supported with 'method: xml'. "
1144
1142
f'({ job .get_indexed_location ()} )'
1145
1143
)
1146
- # for the below, see https://lxml.de/FAQ.html#why-can-t-lxml-parse-my-xml-from-unicode-strings
1147
- self .parser = etree .HTMLParser () if self .method == 'html' else etree .XMLParser ()
1144
+ self .parser = etree .HTMLParser () if self .method == 'html' else etree .XMLParser () # etree._FeedParser
1148
1145
self .data = ''
1149
1146
1150
1147
def feed (self , data : str ) -> None :
@@ -1220,11 +1217,20 @@ def _orphaned(self, element: etree.Element) -> bool:
1220
1217
def _get_filtered_elements (self ) -> List [Union [etree .Element , str ]]:
1221
1218
if self .method == 'xml' and isinstance (self .data , str ):
1222
1219
# see https://lxml.de/FAQ.html#why-can-t-lxml-parse-my-xml-from-unicode-strings
1223
- root = etree .fromstring ( # nosec B320: use defusedxml TODO
1224
- self .data .encode (errors = 'xmlcharrefreplace' ), self .parser
1225
- )
1220
+ data : Union [str , bytes ] = self .data .encode (errors = 'xmlcharrefreplace' )
1221
+ elif self .method == 'html' and self .data .startswith ('<?xml' ):
1222
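+ # HTML input that begins with an XML declaration ('<?xml ...?>'): drop everything up to the first '>' before parsing; see https://stackoverflow.com/questions/37592045/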
1223
+ data = self .data .split ('>' , maxsplit = 1 )[1 ]
1226
1224
else :
1227
- root = etree .fromstring (self .data , self .parser ) # nosec B320: use defusedxml TODO
1225
+ data = self .data
1226
+ try :
1227
+ root = etree .fromstring (data , self .parser ) # nosec B320: use defusedxml TODO
1228
+ except ValueError as e :
1229
+ args = (
1230
+ f"Filter '{ self .filter_kind } ' encountered the following error when parsing the data. Check that "
1231
+ f"'method: { self .method } ' is the correct one.\n { type (e ).__name__ } : { e .args [0 ]} "
1232
+ )
1233
+ raise RuntimeError (args ) from None
1228
1234
if root is None :
1229
1235
return []
1230
1236
selected_elems : Optional [List [etree .Element ]] = None
0 commit comments