JPHTMLParser.py
import re
import bs4
'''
This is an early version of a Beautiful Soup wrapper.
'''
class JPHTMLParser:
    @staticmethod
    def __object_validation(html_page):
        # Accept either raw HTML (str/bytes) or an already-parsed bs4 object.
        if not isinstance(html_page, bs4.element.Tag):
            html_page = bs4.BeautifulSoup(html_page, 'html.parser')
        return html_page
    @staticmethod
    def find_doms_by_regex(html_page, regex=None):
        # Return every element whose id attribute matches the given regex
        # (a pattern string is expected; None will raise from re.compile).
        html_page = JPHTMLParser.__object_validation(html_page)
        matches = html_page.find_all(id=re.compile(regex))
        return matches
    @staticmethod
    def find_dom_by_id(html_page, dom_id=None):
        # Return the first element whose id attribute equals dom_id.
        html_page = JPHTMLParser.__object_validation(html_page)
        return html_page.find(id=dom_id)
@staticmethod
def find_doms_by_class(html_page, html_tag, css_class):
html_page = JPHTMLParser.__object_validation(html_page)
matches = html_page.find_all(html_tag, class_=css_class)
return matches
@staticmethod
def find_dom_by_class(html_page, html_tag, css_class):
html_page = JPHTMLParser.__object_validation(html_page)
matches = html_page.find(html_tag, class_=css_class)
return matches
@staticmethod
def find_dom_by_tag(html_page, html_tag):
html_page = JPHTMLParser.__object_validation(html_page)
matches = html_page.find(html_tag)
return matches
@staticmethod
    def remove_all_tags(html_page):
        # NOTE: this mutates the original html_page in place; we deliberately
        # skip cloning to avoid the extra time and memory cost.
        html_page = JPHTMLParser.__object_validation(html_page)
        for data in html_page(['style']):
            data.decompose()
        return ' '.join(html_page.stripped_strings)
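
# A minimal usage sketch, not part of the original module: the HTML snippet
# below is hypothetical and only illustrates how the wrapper methods might be
# called on raw markup.
if __name__ == '__main__':
    sample_html = '''
    <html>
      <head><style>body { color: red; }</style></head>
      <body>
        <div id="menu-main" class="nav">Home</div>
        <div id="menu-footer" class="nav">Contact</div>
        <p class="intro">Hello world</p>
      </body>
    </html>
    '''
    # Elements whose id attribute matches a regex.
    print(JPHTMLParser.find_doms_by_regex(sample_html, regex=r'^menu-'))
    # All <div> elements carrying the "nav" CSS class.
    print(JPHTMLParser.find_doms_by_class(sample_html, 'div', 'nav'))
    # First <p> element in the document.
    print(JPHTMLParser.find_dom_by_tag(sample_html, 'p'))
    # Plain text with <style> contents stripped out.
    print(JPHTMLParser.remove_all_tags(sample_html))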