-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape.py
112 lines (82 loc) · 3.19 KB
/
scrape.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/env python
import requests
import pandas as pd
from datetime import date
from warnings import warn
import re
from sqlalchemy import create_engine
# Root URL under which the 2016 map data files are fetched.
base_data_url = 'http://static.politico.com/mapdata/2016/'

# SQLAlchemy engine for the SQLite database the parsed results are written to.
engine = create_engine('sqlite:///primary_data.db')

# Race configuration. The URL is derived from base_data_url so the endpoint
# is defined in exactly one place (it resolves to the same address as before).
_config_response = requests.get(
    requests.compat.urljoin(base_data_url, 'config_primaries.json')
)
# Surface an HTTP failure as such instead of as a confusing JSON/Key error.
_config_response.raise_for_status()
raw_races = _config_response.json()['races']

# After transposing, each row is one race and the columns are its fields
# (the code below relies on 'date', 'officeID' and 'stateAbb').
df = pd.DataFrame(raw_races).T
df['date'] = pd.to_datetime(df['date'], format='%Y%m%d')

# (stateAbb, date) pairs for races dated today or earlier whose officeID is
# 'P'. The cutoff is a pandas Timestamp: comparing a datetime64 column with a
# plain datetime.date is deprecated and rejected by newer pandas releases.
races = (
    df.loc[(df['date'] <= pd.Timestamp(date.today()))
           & (df['officeID'] == 'P'),
           ['stateAbb', 'date']]
    .values
)
def format_state_date(state, date):
    '''Build the "<STATE>_<YYYYMMDD>" string that get_results() expects.

    state (string): a two-letter state abbreviation; it is upper-cased here.
    date (datetime.date-like): any object offering a strftime() method.'''
    abbreviation = state.upper()
    stamp = date.strftime('%Y%m%d')
    return '%s_%s' % (abbreviation, stamp)
def get_results(state_date, base_data_url=base_data_url):
    '''Fetch raw results data for date and state.

    state_date (string): state & date separated by underscore
        (e.g., "IA_20160201")
    base_data_url (string): "endpoint" to find raw results xml at.
        defaults to "http://static.politico.com/mapdata/2016/"

    Returns the response body as text. If the request fails, or the server
    answers with an HTTP error status, a warning is issued and None is
    returned instead.'''
    url = requests.compat.urljoin(base_data_url, '%s.xml' % state_date)
    try:
        # The request is the part that can actually fail, so it has to live
        # inside the try block. raise_for_status() turns 4xx/5xx answers into
        # exceptions so an error page is never returned as if it were data.
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        # Format the exception itself: e.message does not exist on Python 3.
        warn('Error encountered: %s' % e)
        return None
    return response.text
def parse_line(line, candidate_names=None):
    '''Parse line in raw results.

    line (string): the line to parse. It must contain exactly one "||",
        separating the metadata fields from the records; records are
        separated by "|" and fields by ";".
    candidate_names (dict, optional): maps a field value to the text that
        should replace it. Only fields that equal a key in full are
        replaced; empty keys are ignored.

    Returns a pandas.DataFrame with one row per record, each row holding the
    metadata fields followed by that record's fields.'''
    meta, data = line.split('||')
    # An empty key would match every empty field, so it is skipped.
    keys = [key for key in (candidate_names or {}) if key]
    if keys:
        # Keys are escaped so they are matched literally. The lookarounds
        # accept a key only when it fills a whole field (bounded by ";", "|"
        # or an end of the string); otherwise a short key such as "1" would
        # also rewrite the digits inside an unrelated field such as "21".
        pattern = re.compile(
            r'(?<![^;|])(?:%s)(?![^;|])'
            % '|'.join(re.escape(key) for key in keys)
        )
        data = pattern.sub(lambda m: candidate_names[m.group()], data)
    # The metadata is identical for every record, so split it only once.
    meta_fields = meta.split(';')
    df = pd.DataFrame(
        [pd.Series(meta_fields + d.split(';')) for d in data.split('|')]
    )
    return df
def parse_results(results):
    '''Turn one raw results document into a single DataFrame.

    results (string): raw results. The text must split on a blank line
        ("\\n\\n") into exactly two parts: a header and the data lines.

    The second header line holds "|"-separated candidate entries. For each
    entry the first ";" field becomes the lookup key; the remaining fields
    (after dropping ";;" and a trailing ";") are joined with ", " to form
    the replacement text handed to parse_line().'''
    header, body = results.split('\n\n')

    candidate_line = header.split('\n')[1]
    candidates = {}
    for entry in candidate_line.split('|'):
        cleaned_fields = re.sub(r';;|;$', '', entry).split(';')
        candidates[entry.split(';')[0]] = ', '.join(cleaned_fields[1:])

    # Empty strings (e.g. from a trailing newline) are not data lines.
    frames = []
    for row in body.split('\n'):
        if row != '':
            frames.append(parse_line(row, candidates))

    return pd.concat(frames, ignore_index=True)
def process_race(state, date):
    '''Take a state abbreviation and a date, fetch and process raw results.

    state (string): a two-letter state abbreviation.
    date (datetime.date-like): a datetime.date.strftime()-mungeable date.

    Returns whatever parse_results() produces for the fetched text, or None
    when get_results() returned None (its failure signal).'''
    state_date = format_state_date(state, date)
    raw_results = get_results(state_date)
    if raw_results is None:
        # Nothing to parse: pass the failure on as None instead of letting
        # parse_results() blow up on None.split(). pandas.concat() ignores
        # None entries as long as at least one real frame is present.
        return None
    return parse_results(raw_results)
# Fetch and parse every selected race, then stack the frames into one table
# with a fresh 0..n-1 index.
results = pd.concat(
    [process_race(abbr, day) for abbr, day in races],
    ignore_index=True,
)

# Name all 21 positional columns; every column labelled '_' is a throwaway
# and is dropped right after.
results.columns = [
    'state',
    'race_type',
    'elec_type',
    'fips',
    'place_name',
    '_',
    'percent_reporting',
    '_',
    '_',
    'race_id',
    '_',
    '_',
    'candidate',
    'party',
    'votes',
    'vote_share',
    'winner',
    '_',
    '_',
    'delegates_state',
    '_',
]
results = results.drop('_', axis=1)

# Write the table to the 'results' table, replacing any previous contents.
results.to_sql(name='results', con=engine, if_exists='replace', index=False)