Skip to content

Commit

Permalink
add html feature extraction: subtrees and tokenization
Browse files Browse the repository at this point in the history
  • Loading branch information
ilyalasy committed Jan 6, 2023
1 parent 565685d commit ce0ded5
Show file tree
Hide file tree
Showing 3 changed files with 418 additions and 0 deletions.
106 changes: 106 additions & 0 deletions data/tags.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
a
abbr
address
area
article
aside
audio
b
base
bdo
blockquote
body
br
button
canvas
caption
cite
code
col
colgroup
command
datalist
dd
del
details
dfn
div
dl
dt
em
embed
fieldset
figcaption
figure
footer
form
h1
h2
h3
h4
h5
head
header
hr
html
i
iframe
img
input
ins
kbd
label
legend
li
link
map
main
mark
menu
meta
meter
nav
noscript
object
ol
optgroup
option
output
p
param
pre
progress
q
rp
rt
ruby
s
samp
script
section
select
small
source
span
strong
style
sub
sup
svg
table
tbody
td
textarea
tfoot
th
thead
time
title
tr
track
u
ul
var
video
wbr
UNK
36 changes: 36 additions & 0 deletions src/html_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
from lxml import etree
from lxml.html.clean import Cleaner
from lxml.html.soupparser import fromstring
import re
import unicodedata

def clean_spaces(text):
    """Collapse every run of whitespace in *text* into a single space.

    Leading and trailing whitespace is removed entirely.

    Args:
        text: The string to normalize.

    Returns:
        The normalized string; empty when *text* is empty or contains
        only whitespace.
    """
    # str.split() with no separator already drops leading/trailing
    # whitespace and splits on whitespace runs, so no regex is required.
    return " ".join(text.split())

def clean_format_str(text):
    """Reduce *text* to non-control ASCII with single-space separation.

    Whitespace runs are collapsed to one space, then every character that
    is non-ASCII or belongs to a Unicode "C" (control/format/other)
    category is dropped.

    Args:
        text: The string to sanitize.

    Returns:
        The sanitized string with no leading or trailing whitespace.
    """
    # Whitespace must be normalized *before* control characters are removed:
    # "\n", "\t" and "\r" are themselves in category "Cc", so filtering first
    # would delete them and glue neighbouring tokens together
    # (e.g. '<div\nclass="x">' would become '<divclass="x">').
    text = clean_spaces(text)
    text = "".join(
        ch
        for ch in text
        if ord(ch) < 128 and unicodedata.category(ch)[0] != "C"
    )
    # Removed characters can leave adjacent spaces behind ("a \x00 b").
    return clean_spaces(text)

# Shared Cleaner instance used by this module. The scripts, javascript and
# comments options are switched on; the remaining options listed here are
# explicitly switched off. Options not listed keep the library defaults.
cleaner = Cleaner(
    scripts=True,
    javascript=True,
    comments=True,
    style=False,
    links=False,
    page_structure=False,
    embedded=False,
    frames=False,
    forms=False,
    annoying_tags=False,
    remove_unknown_tags=False,
    safe_attrs_only=False,
)

def get_cleaned_body(html):
    """Parse *html* and return the ``body`` of the cleaned document.

    The markup is sanitized with ``clean_format_str``, parsed with
    ``lxml.html.soupparser.fromstring`` and passed through the module-level
    ``cleaner`` before the body element is looked up.

    Args:
        html: The HTML document as a string.

    Returns:
        The ``body`` attribute of the cleaned root element.
    """
    html = html.replace("\0", "")  # Delete NULL bytes.
    html = clean_format_str(html)
    # The cleaned root is used directly: wrapping it in an ElementTree and
    # calling getroot() would only hand back this same element.
    cleaned_root = cleaner.clean_html(fromstring(html))
    return cleaned_root.body
Loading

0 comments on commit ce0ded5

Please sign in to comment.