This repository has been archived by the owner on Feb 22, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathexplore.py
90 lines (74 loc) · 2.35 KB
/
explore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# Z0096
import pandas as pd
import numpy as np
from acquire import get_data
from prepare import encode, impute_mean, split_df
#################### Explore telco_churn Data ####################
# default column selection for the exploration DataFrame, grouped by theme
cols = (
    # customer demographics
    ['is_female', 'is_senior', 'has_partner', 'has_dependent']
    # phone service status
    + ['has_phone', 'one_line', 'multiple_lines']
    # internet service status
    + ['has_internet', 'dsl', 'fiber']
    # internet options
    + [
        'streaming_tv',
        'streaming_movies',
        'online_security',
        'online_backup',
        'device_protection',
        'tech_support',
    ]
    # service charges
    + ['monthly_charges', 'total_charges']
    # payment information
    + [
        'mailed_check',
        'electronic_check',
        'bank_transfer',
        'credit_card',
        'paperless_billing',
        'autopay',
    ]
    # subscription information
    + ['no_contract', 'tenure']
)
def explore_data(columns=cols, cache=False):
    '''
    Build the exploration subset of the telco_churn data and print a short
    shape summary. The frame goes through the same acquire/prepare helpers
    (get_data, impute_mean, encode, split_df) that the original notes say
    are used to build the model-training dataset.

    columns=cols    column names to keep; the 'churn' column is always
                    appended after them. Defaults to every name in cols
    cache=False     forwarded unchanged to get_data(cache=...). Per the
                    original notes: set true to force writing a new CSV
                    file, otherwise the cached version is used

    Returns the first of the three objects produced by split_df (described
    in the original comments as the training dataset).
    '''
    # load the data, then fill missing values in total_charges
    telco = impute_mean(get_data(cache=cache))

    # derive 0/1 flag columns: (new column, source column, matching value)
    flag_rules = (
        ('one_line', 'multiple_lines', 'No'),
        ('dsl', 'internet_service_type_id', 1),
        ('mailed_check', 'payment_type_id', 2),
        ('no_contract', 'contract_type_id', 1),
    )
    for flag, source, match in flag_rules:
        telco[flag] = np.where(telco[source] == match, 1, 0)

    telco = encode(telco)

    # keep the requested columns, with churn as the final column
    selected = pd.concat((telco[columns], telco['churn']), axis=1)

    # only the first split is used for exploration; the other two are dropped
    subset, _, _ = split_df(selected)

    # the literal below is kept flush-left on purpose: any leading whitespace
    # inside the triple-quoted string would be printed as part of the report
    print(f'''
Data Processing Complete
+----------------------------------------+
| Source DataFrame Shape : {selected.shape[0]} x {selected.shape[1]:<5}|
| Subset DataFrame Shape : {subset.shape[0]} x {subset.shape[1]:<5}|
| Data Percentage Used : {subset.shape[0] / selected.shape[0]:<12.2%}|
+----------------------------------------+
''')
    return subset