-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_cleaning.py
92 lines (61 loc) · 2.46 KB
/
data_cleaning.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import pandas as pd
import numpy as np
def load_clean_zillow_data():
    '''
    load zillow_data.csv and run it through every cleaning step

    steps, applied in order:
        load_raw_zillow()
            - reads zillow_data.csv into a dataframe
        drop_df_columns(df)
            - removes the extraneous columns
        melt_df(df)
            - gathers the date columns into two columns, 'time' and 'value'
        zipcode_columns(df)
            - pivots the result into a dataframe of timeseries by zipcode

    output: the loaded and cleaned dataframe
        na values introduced by the pivot step are kept on purpose: the
        affected columns may still be useful because the length of the
        forecast is not yet decided
    '''
    cleaned = load_raw_zillow()
    # Each step takes the previous step's dataframe and returns a new one.
    for step in (drop_df_columns, melt_df, zipcode_columns):
        cleaned = step(cleaned)
    return cleaned
def drop_df_columns(frame):
    '''
    data cleaning: remove the columns that are not used downstream
    input:
        frame: dataframe containing the RegionID and SizeRank columns
    output: a new dataframe without the RegionID and SizeRank columns
    '''
    unwanted = ['RegionID', 'SizeRank']
    return frame.drop(columns=unwanted)
def melt_df(frame):
    '''
    data transformation: reshape the wide frame (one column per date) into
    long format, one row per region and date
    input:
        frame: dataframe holding the id columns 'RegionName', 'City', 'State',
               'Metro' and 'CountyName'; every other column is treated as a
               date column, so its label must be parseable by pd.to_datetime
    output: dataframe with the id columns plus 'time' (datetime) and 'value';
            rows whose 'value' is na are dropped
    '''
    id_columns = ['RegionName', 'City', 'State', 'Metro', 'CountyName']
    # value_name is spelled out because the dropna below relies on that label.
    melted = pd.melt(frame, id_vars=id_columns, var_name='time', value_name='value')
    # infer_datetime_format is deliberately not passed: it is deprecated since
    # pandas 2.0 (format inference is the default there) and only triggers a
    # warning; the parsed timestamps are the same without it.
    melted['time'] = pd.to_datetime(melted['time'])
    return melted.dropna(subset=['value'])
def zipcode_columns(frame):
    '''
    data transformation: spread the melted frame so that each zipcode becomes
    its own column, indexed by time
    input: melted dataframe with 'time', 'RegionName' and 'value' columns
    output: dataframe of timeseries by zipcode; column labels are strings
    '''
    wide = pd.pivot_table(frame, values='value', index='time', columns='RegionName')
    # String labels are easier to index by; assigning a plain list also
    # discards the 'RegionName' name carried by the pivoted column index.
    # NOTE(review): str() of an integer zipcode has no leading zeros, so such
    # labels can be shorter than five characters - confirm callers expect that.
    wide.columns = list(map(str, wide.columns))
    return wide
def load_raw_zillow():
    '''
    read zillow_data.csv straight from the github repo with pd.read_csv
    (needs network access)
    outputs: dataframe of zillow_data.csv, exactly as parsed by pd.read_csv
    '''
    source_url = 'https://raw.githubusercontent.com/learn-co-students/dc-ds-060120/master/mod-4/week-3/Mod_4_Project/time-series/zillow_data.csv'
    return pd.read_csv(source_url)