forked from code4lib/shortimer
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathminer.py
193 lines (159 loc) · 5.36 KB
/
miner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
# -*- coding: utf-8 -*-
import re
import json
import time
import codecs
import rfc822
import urllib
import logging
import datetime
import StringIO
import nltk
from shortimer.jobs.models import Job, Keyword, Subject
"""
Functions for doing text munging on job text.
"""
# Part-of-speech tags that nouns() accepts as proper nouns.
NOUN_CODES = ["NNP"]
# Regex used by get_html() to find URLs in plain text; capture group 1 spans
# the whole URL. Pattern taken from:
# http://daringfireball.net/2010/07/improved_regex_for_matching_urls
URL_PATTERN = re.compile(r'''(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))''')
def email_to_job(msg):
    """Turn a mailing list email into a saved Job.

    msg is an email message object supporting header lookup by name
    (msg['subject'], msg['from'], ...).

    Returns the saved Job, or None when the message is skipped because it
    does not look like a job ad, was already loaded, has no usable body, or
    was posted by shortimer itself.
    """
    logging.info("looking at email with subject: %s", msg['subject'])
    if not is_job_email(msg):
        return None

    # Any existing row means the message was already loaded; testing for
    # exactly one row would let further duplicates in once two exist.
    if Job.objects.filter(email_message_id=msg['message-id']).count() > 0:
        return None

    logging.info("parsing job email %s", msg['message-id'])

    j = Job()

    # rfc822.parseaddr returns (None, None) when the From header is missing
    # or cannot be parsed, so fall back to empty strings.
    name, address = rfc822.parseaddr(msg['from'] or '')
    j.contact_name = normalize_name(name or '')
    j.contact_email = (address or '').lower()

    # get the employer
    #j.from_domain = j.from_address.split('@')[1]

    # strip the list prefix and any header-folding line breaks
    j.title = re.sub(r"^\[CODE4LIB\] ", "", msg['subject'])
    j.title = re.sub(r"[\n\r]", "", j.title)
    j.email_message_id = msg['message-id']
    j.description = get_html(get_body(msg))

    if not j.description:
        logging.warning("missing body")
        return None

    if 'http://jobs.code4lib.org' in j.description:
        logging.warning("not loading a job that shortimer posted")
        return None

    # rfc822.parsedate returns None for a missing/unparseable Date header
    # and time.mktime can reject out-of-range values; use the current time
    # rather than crashing the whole import.
    try:
        parsed_date = rfc822.parsedate(msg['date'])
        if parsed_date is None:
            raise ValueError("unparseable date")
        stamp = time.mktime(parsed_date)
        j.post_date = datetime.datetime.fromtimestamp(stamp)
    except (TypeError, ValueError, OverflowError):
        logging.warning("bad date header %r, using current time", msg['date'])
        j.post_date = datetime.datetime.now()

    # Saved once before autotag() attaches subjects -- presumably the
    # relation needs the job to have a primary key first (confirm against
    # the Job model) -- and once more afterwards.
    j.save()
    autotag(j)
    j.save()
    return j
def autotag(job):
    """Attach to job every Subject that has a keyword matching a proper
    noun phrase found in the job description.

    Phrases come from nouns() and are lower-cased before being compared to
    keyword names.
    """
    for phrase in nouns(job.description):
        matching_subjects = Subject.objects.filter(keywords__name=phrase.lower())
        for matched in matching_subjects:
            job.subjects.add(matched)
def normalize_name(name):
    """Rewrite a comma separated name such as "Last, First" as "First Last".

    The final comma separated piece is moved to the front and the pieces are
    joined with single spaces. Names without a comma, and empty or None
    names, are returned unchanged.
    """
    if not name or ',' not in name:
        return name
    # Drop empty pieces so a trailing or doubled comma ("Smith,") does not
    # leave stray whitespace in the result.
    parts = [p.strip() for p in name.split(',')]
    parts = [p for p in parts if p]
    if not parts:
        return ''
    parts.insert(0, parts.pop())
    return ' '.join(parts)
def get_html(text):
    """Convert plain text into simple HTML.

    The text is wrapped in <p> tags, blank lines become paragraph breaks and
    anything matching URL_PATTERN is turned into a link. Returns None when
    text is None.

    The text comes from email bodies, so everything outside the generated
    tags is HTML-escaped; otherwise markup in a message would be emitted
    verbatim into the page.
    """
    if text is None:
        return None

    # escape() lives in different stdlib modules on Python 2 and 3
    try:
        from html import escape
    except ImportError:
        from cgi import escape

    pieces = []
    cursor = 0
    # Find URLs in the raw text, so that escaping cannot change what the
    # pattern matches, then escape the URLs and the text between them
    # separately.
    for match in URL_PATTERN.finditer(text):
        pieces.append(escape(text[cursor:match.start(1)], True))
        url = escape(match.group(1), True)
        # NOTE(review): URL_PATTERN accepts any scheme (e.g. "javascript:");
        # consider restricting href to http/https/mailto.
        pieces.append('<a href="%s">%s</a>' % (url, url))
        cursor = match.end(1)
    pieces.append(escape(text[cursor:], True))

    html = "<p>" + "".join(pieces) + "</p>"
    return html.replace("\n\n", "</p>\n\n<p>")
def get_body(msg):
    """Return the decoded text body of an email message.

    For a multipart message the first non-container part whose main content
    type is "text" is used, searching nested multiparts as well. Not going
    to get in the business of extracting text from word, pdf, etc.

    Returns None when a multipart message has no text part or when the
    declared charset is unknown to Python. Bytes that are invalid for the
    declared charset are replaced rather than raising.
    """
    if msg.is_multipart():
        text_part = None
        # walk() descends into nested containers such as a
        # multipart/alternative inside a multipart/mixed.
        for part in msg.walk():
            if part.is_multipart():
                continue
            # get_content_maintype() falls back to a default type when the
            # Content-Type header is missing instead of returning None.
            if part.get_content_maintype() == 'text':
                text_part = part
                break
        # Compare with None: a Message's truth value is its header count.
        if text_part is None:
            return None
        msg = text_part

    charset = msg.get_content_charset()
    if not charset:
        logging.warning("no charset assuming utf8")
        charset = "utf8"

    try:
        codecs.lookup(charset)
    except LookupError:
        logging.warning("no codec for %s", charset)
        return None

    payload = msg.get_payload(decode=True) or b""
    # mislabelled charsets are common in mail; don't let them abort loading
    return payload.decode(charset, "replace")
def is_job_email(msg):
    """Decide from the subject line whether an email looks like a job ad.

    Messages with no subject, and replies (subject starting with "re:"),
    are rejected. Otherwise the lower-cased subject must mention "job",
    "position" or "employment".
    """
    raw_subject = msg['subject']
    if not raw_subject:
        return False

    lowered = raw_subject.lower()
    if re.search('^re:', lowered):
        return False

    for keyword in ('job', 'position', 'employment'):
        if re.search(keyword, lowered):
            return True
    return False
def nouns(text):
    """Yield proper noun phrases from a chunk of text.

    Consecutive tokens from tags() that carry a tag in NOUN_CODES and
    consist only of letters are joined with spaces into one phrase. Any
    other token ends the current phrase.
    """
    run = []
    for word, code in tags(text):
        if code in NOUN_CODES and re.match("^[a-z]+$", word, re.IGNORECASE):
            run.append(word)
        elif run:
            yield " ".join(run)
            run = []
    # A phrase that runs up to the very end of the text has no following
    # token to flush it, so emit it here.
    if run:
        yield " ".join(run)
def tags(text):
    """Tokenize text with nltk and return the result of part of speech
    tagging the tokens.
    """
    return nltk.pos_tag(nltk.word_tokenize(text))
def wikipedia_term(term):
    """Pass in a term or phrase and get back True if it is on wikipedia, or
    False if it is not.

    The term counts as present when one of the titles in the opensearch
    response equals it, ignoring case. False is also returned when the
    lookup fails or the response is not in the expected shape.
    """
    url = "http://en.wikipedia.org/w/api.php?action=opensearch&search=%s" % term
    hits = _get_json(url)
    # The code reads the titles from the second element of a list response.
    if not isinstance(hits, list) or len(hits) < 2:
        return False
    # Compare against the term itself (previously an undefined name was
    # used here, which raised NameError on the first hit).
    wanted = term.lower()
    return any(hit.lower() == wanted for hit in hits[1])
def wikipedia_categories(term):
    """Pass wikipedia term and get back the categories it belongs to.

    Returns a list of category names with the "Category:" prefix removed.
    The list is empty when the lookup fails, the response lacks page data,
    or the page has no categories.
    """
    url = "http://en.wikipedia.org/w/api.php?action=query&prop=categories&titles=%s&format=json&cllimit=50" % term
    results = _get_json(url)
    try:
        pages = results['query']['pages']
    except (TypeError, KeyError):
        # _get_json returned None (fetch/parse failure) or an unexpected shape
        return []
    if not pages:
        return []
    # Only the first page entry is used, as a single title is queried.
    page = next(iter(pages.values()))
    return [re.sub('^Category:', '', c['title'])
            for c in page.get('categories', [])]
def _get_json(url):
    """utility to fetch and decode json

    Returns the decoded JSON document, or None when the URL cannot be
    fetched or the response is not valid JSON; the failure is logged with
    its traceback.
    """
    from contextlib import closing

    try:
        # closing() makes sure the connection is released even when reading
        # or parsing fails.
        with closing(urllib.urlopen(url)) as response:
            return json.loads(response.read())
    except ValueError:
        logging.exception("bad JSON from %s", url)
    except Exception:
        # best-effort helper: callers treat None as "no data"
        logging.exception("unable to get %s", url)
    return None