-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcity_category_extractor.py
47 lines (37 loc) · 1.2 KB
/
city_category_extractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
__author__ = 'vasyanya'
from bs4 import BeautifulSoup
import re
class CityCategoryData:
    """Plain record holding the data parsed from one vacancy link.

    All constructor arguments are optional, so ``CityCategoryData()`` still
    yields the original defaults and fields may be assigned afterwards.

    Args:
        link: Value stored in ``link``; defaults to an empty string.
            ``extract`` stores the city segment of the URL here.
        category: Category segment of the URL, or ``None`` when absent.
        text: Visible text of the anchor, or ``None`` when absent.
    """

    def __init__(self, link='', category=None, text=None):
        self.link = link
        self.category = category
        self.text = text

    def __repr__(self):
        # Show every field so parsed results are readable in logs and debuggers.
        return '{0}(link={1!r}, category={2!r}, text={3!r})'.format(
            self.__class__.__name__, self.link, self.category, self.text)
def extract(content):
    """Collect city/category data from the vacancy links in an HTML document.

    Every ``<a>`` element whose ``href`` starts with ``/vacancy/<city>/``
    (optionally followed by ``category/#<category>``) yields one
    ``CityCategoryData`` entry. Anchors without an ``href`` and hrefs that do
    not follow that pattern are skipped instead of raising.

    Args:
        content: HTML markup; passed to ``BeautifulSoup`` unchanged.

    Returns:
        A list of ``CityCategoryData`` objects, in the order
        ``find_all('a')`` returns the anchors. For each entry ``link`` holds
        the city segment of the URL, ``category`` the category segment (or
        ``None`` when the URL has none) and ``text`` the anchor text (or
        ``None`` when the anchor text is empty).
    """
    # NOTE(review): no parser is named here, so BeautifulSoup chooses one
    # itself -- presumably whichever is installed; consider pinning a parser
    # explicitly once the expected one is confirmed.
    soup = BeautifulSoup(content)

    # Raw string: ``\w`` must reach the regex engine untouched and is not a
    # valid Python string escape.
    vacancy_link_regex = re.compile(
        r'/vacancy/(?P<city>\w+)/(?:category/#(?P<category>\w+))?',
        re.UNICODE)

    results = []
    for link in soup.find_all('a'):
        href = link.get('href')
        if not href:
            # Anchor without an href (e.g. a named anchor): nothing to parse.
            continue

        # ``match`` anchors at the start of the href, so only site-relative
        # vacancy URLs qualify; anything else is not a city link.
        m = vacancy_link_regex.match(href)
        if m is None:
            continue

        # City is required by the pattern; category and text are optional.
        data = CityCategoryData()
        data.link = m.group('city')
        data.category = m.group('category')  # None when the group is absent
        text = link.getText()
        if text:
            data.text = text
        results.append(data)
    return results