add html feature extraction: subtrees and tokenization

ilyalasy · Jan 6, 2023 · ce0ded5 · ce0ded5
1 parent 565685d
commit ce0ded5
Show file tree

Hide file tree

Showing 3 changed files with 418 additions and 0 deletions.
diff --git a/data/tags.txt b/data/tags.txt
@@ -0,0 +1,106 @@
+a
+abbr
+address
+area
+article
+aside
+audio
+b
+base
+bdo
+blockquote
+body
+br
+button
+canvas
+caption
+cite
+code
+col
+colgroup
+command
+datalist
+dd
+del
+details
+dfn
+div
+dl
+dt
+em
+embed
+fieldset
+figcaption
+figure
+footer
+form
+h1
+h2
+h3
+h4
+h5
+head
+header
+hr
+html
+i
+iframe
+img
+input
+ins
+kbd
+label
+legend
+li
+link
+map
+main
+mark
+menu
+meta
+meter
+nav
+noscript
+object
+ol
+optgroup
+option
+output
+p
+param
+pre
+progress
+q
+rp
+rt
+ruby
+s
+samp
+script
+section
+select
+small
+source
+span
+strong
+style
+sub
+sup
+svg
+table
+tbody
+td
+textarea
+tfoot
+th
+thead
+time
+title
+tr
+track
+u
+ul
+var
+video
+wbr
+UNK
diff --git a/src/html_utils.py b/src/html_utils.py
@@ -0,0 +1,36 @@
+from lxml import etree
+from lxml.html.clean import Cleaner
+from lxml.html.soupparser import fromstring
+import re
+import unicodedata
+
+def clean_spaces(text):
+    """Trim *text* and squeeze every run of whitespace into one space."""
+    trimmed = text.strip()
+    return re.sub(r"\s+", " ", trimmed)
+
+def clean_format_str(text):
+    """Return *text* as printable ASCII with single-space separators.
+
+    Steps, in order:
+      1. Collapse every whitespace run to one space and trim the ends.
+      2. Drop characters whose Unicode category starts with "C"
+         (control, format, ...) and every non-ASCII character.
+      3. Collapse spaces again, because step 2 can leave doubled spaces
+         where a removed character sat between two separators.
+
+    Whitespace must be normalised *before* step 2: newline, tab and
+    carriage return are themselves category "Cc", so filtering first
+    deletes them outright and glues neighbouring tokens together
+    (e.g. "<a\nhref=...>" would become "<ahref=...>").
+    """
+    # str.split() with no argument splits on whitespace runs and discards
+    # leading/trailing ones, so join() yields the collapsed, trimmed form.
+    text = " ".join(text.split())
+    text = "".join(
+        ch for ch in text
+        if ord(ch) < 128 and unicodedata.category(ch)[0] != "C"
+    )
+    return " ".join(text.split())
+
+# Shared lxml Cleaner: only script, JavaScript and comment removal are
+# enabled; every other cleaning option is explicitly switched off.
+cleaner = Cleaner(
+    scripts=True,
+    javascript=True,
+    comments=True,
+    style=False,
+    links=False,
+    page_structure=False,
+    embedded=False,
+    frames=False,
+    forms=False,
+    annoying_tags=False,
+    remove_unknown_tags=False,
+    safe_attrs_only=False,
+)
+
+def get_cleaned_body(html):
+    """Parse an HTML string and return the cleaned document's ``body``.
+
+    The markup is stripped of NULL bytes, passed through
+    ``clean_format_str``, parsed with ``lxml.html.soupparser.fromstring``
+    and run through the module-level ``cleaner``. The ``body`` attribute
+    of the resulting root element is returned.
+    """
+    sanitized = clean_format_str(html.replace("\0", ""))  # Delete NULL bytes.
+    parsed_root = fromstring(sanitized)
+    cleaned_root = cleaner.clean_html(parsed_root)
+    return etree.ElementTree(cleaned_root).getroot().body