pdsnd_github/bikeshare.py at master · maxysio/pdsnd_github · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
import time
import pandas as pd
import numpy as np
import calendar
from scipy import stats
import os

# Global lookup tables. Each "*_list" dict translates a lower-cased user reply
# (a word, an abbreviation or a menu digit) into the integer code used by the
# rest of the script.

# City name or menu digit -> city code.
city_list = {'chicago': 1, 'washington': 2, 'new york': 3, '1': 1, '2': 2, '3': 3}

# City code -> name of the CSV file holding that city's trips.
city_file = {1: 'chicago.csv', 2: 'washington.csv', 3: 'new_york_city.csv'}

# Month name, 3-letter abbreviation or number -> month number (1-12).
month_list = {
    'january': 1, 'february': 2, 'march': 3, 'april': 4, 'may': 5, 'june': 6,
    'july': 7, 'august': 8, 'september': 9, 'october': 10, 'november': 11,
    'december': 12,
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'jun': 6, 'jul': 7, 'aug': 8,
    'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    '1': 1, '2': 2, '3': 3, '4': 4, '5': 5, '6': 6, '7': 7, '8': 8, '9': 9,
    '10': 10, '11': 11, '12': 12,
}

# Filter-mode reply -> mode code. get_datafilters() treats 1 as "filter by
# month", 2 as "filter by day", 3 as both and 4 as no filter, and its prompt
# numbers the options the same way, so the words must map to those same codes
# (previously 'day' and 'month' were swapped relative to the digits).
day_month_list = {'month': 1, 'day': 2, 'both': 3, 'none': 4, '1': 1, '2': 2, '3': 3, '4': 4}

# Day name, menu digit or 3-letter abbreviation -> weekday number.
# Values use Monday=0 ... Sunday=6 (the numbering of pandas' ``dayofweek``,
# which load_data() filters on); the menu digits count from Sunday=1.
day_list = {
    'sunday': 6, 'monday': 0, 'tuesday': 1, 'wednesday': 2, 'thursday': 3,
    'friday': 4, 'saturday': 5,
    '1': 6, '2': 0, '3': 1, '4': 2, '5': 3, '6': 4, '7': 5,
    'sun': 6, 'mon': 0, 'tue': 1, 'wed': 2, 'thu': 3, 'fri': 4, 'sat': 5,
}

# City code -> display name.
city_name = {1: 'Chicago', 2: 'Washington DC', 3: 'New York City'}


def _prompt_for_choice(prompt, choices):
    """Ask ``prompt`` repeatedly until the reply is a key of ``choices``.

    The reply is stripped of surrounding whitespace and lower-cased before the
    lookup. Returns the value stored in ``choices`` for the accepted reply.
    """
    while True:
        reply = input(prompt).strip().lower()
        if reply in choices:
            return choices[reply]
        print('That doesn\'t look like a valid selection. Lets try again')


def get_datafilters():
    """Interactively collect the city and the optional month / day filters.

    Replies are validated against the module-level lookup tables
    (``city_list``, ``day_month_list``, ``month_list``, ``day_list``); the user
    is re-prompted until a recognised value is entered.

    Returns:
        A ``(city, month, day)`` tuple of integer codes. ``city`` is a value
        from ``city_list``. ``month`` is a value from ``month_list`` or -1 when
        no month filter was requested. ``day`` is a value from ``day_list`` or
        -1 when no day filter was requested.
    """
    # -1 means "do not filter on this field".
    month, day = -1, -1

    city = _prompt_for_choice(
        '\nWhich city would you like to see the data analysis for - Chicago(1), Washington(2) or New York(3)\n',
        city_list)

    # Mode codes: 1 = month only, 2 = day only, 3 = both, 4 = no filter.
    filter_mode = _prompt_for_choice(
        '\nWould you like to filter by month(1), day(2), both(3) or none at all(4)\n',
        day_month_list)

    if filter_mode in (1, 3):
        month = _prompt_for_choice(
            '\nChoose a month you want to filter data by: January, February, March...etc. You can use numbers or 3 letter notations\n',
            month_list)

    if filter_mode in (2, 3):
        day = _prompt_for_choice(
            '\nChoose a day you want to filter data by: Sunday(1), Monday(2)...Saturday(7)\n',
            day_list)

    return city, month, day

def load_data(city, month, day):
    """Load one city's trip CSV and apply the optional month / weekday filters.

    The CSV is looked up next to this script (not in the current working
    directory). Two helper columns, 'Month' and 'DayOfWeek', are derived from
    the 'Start Time' column and kept in the returned frame.

    Args:
        city: City code; must be a key of ``city_file``.
        month: Month number to keep, or -1 to keep every month.
        day: Weekday number to keep using pandas' ``dayofweek`` numbering
            (Monday=0 ... Sunday=6), or -1 to keep every day.

    Returns:
        A DataFrame with the rows that match the requested filters.

    Raises:
        ValueError: if ``city`` is not a known city code.
    """
    file_name = city_file.get(city)
    if file_name is None:
        # Fail with a clear message instead of the TypeError that
        # os.path.join() would raise when handed None.
        raise ValueError('Unknown city code: {!r}'.format(city))

    # Directory that contains this script.
    script_dir = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

    df = pd.read_csv(os.path.join(script_dir, file_name))

    # Parse the start timestamps once and derive both helper columns from them.
    start_times = pd.to_datetime(df['Start Time'], format='%Y-%m-%d %H:%M:%S')
    df['Month'] = start_times.dt.month
    df['DayOfWeek'] = start_times.dt.dayofweek

    # Filter by month if applicable.
    if month > -1:
        df = df[df['Month'] == month]

    # Filter by day if applicable.
    if day > -1:
        df = df[df['DayOfWeek'] == day]

    return df

def raw_data(df):
    """Optionally print the first rows of ``df``.

    The user is asked whether to show raw data; any reply other than 'y' or
    'yes' (case-insensitive) skips the output. Otherwise the user is asked for
    a row count, and a reply that is not an integer falls back to 5 rows.
    """
    answer = input('Would you like to see the raw data? (Yes(Y)/No(N)): ').lower()
    if answer not in ('y', 'yes'):
        return

    requested = input('How many lines would you like to see? ')
    try:
        row_count = int(requested)
    except ValueError:
        print('That does not look like a number, so we will show the first 5 lines')
        row_count = 5

    print(df.head(row_count))

def time_stats(df):
    """Print the most frequent month, weekday and start hour of the trips.

    Reads the 'Month', 'DayOfWeek' and 'Start Time' columns of ``df``. The
    frame itself is not modified. When ``df`` has no rows a short notice is
    printed instead of the statistics.
    """
    print('-'*40)
    print('Time stats coming up')
    print('-'*20)

    if df.empty:
        # mode() of an empty column is empty, so there is nothing to report.
        print('No trips match the selected filters, so there are no time stats to show')
        print('-'*40)
        return

    # display the most common month
    busiest_month = int(df['Month'].mode().iloc[0])
    print('The most popular month: ' + calendar.month_name[busiest_month])

    # display the most common day of week
    busiest_day = int(df['DayOfWeek'].mode().iloc[0])
    print('The most popular day of the week: ' + calendar.day_name[busiest_day])

    # display the most common start hour; the hours are kept in a local Series
    # so the caller's DataFrame does not gain an extra column
    start_hours = pd.to_datetime(df['Start Time'], format='%Y-%m-%d %H:%M:%S').dt.hour
    busiest_hour = int(start_hours.mode().iloc[0])
    # Round-trip through a timestamp to render the hour in 12-hour form.
    most_common_hour = pd.to_datetime(str(busiest_hour), format='%H')
    print('The most popular hour: ' + most_common_hour.strftime('%I %p'))

    print('-'*40)

def station_stats(df):
    """Print the most frequent start station, end station and start-to-end trip.

    Reads the 'Start Station' and 'End Station' columns of ``df``. The frame
    itself is not modified. When ``df`` has no rows a short notice is printed
    instead of the statistics.
    """
    print('-'*40)
    print('Station stats coming up')
    print('-'*20)

    if df.empty:
        # mode() of an empty column is empty, so there is nothing to report.
        print('No trips match the selected filters, so there are no station stats to show')
        print('-'*40)
        return

    #print most popular starting station
    print('The most popular starting station is: ' + str(df['Start Station'].mode().iloc[0]))

    #print most popular ending station
    print('The most popular ending station is: ' + str(df['End Station'].mode().iloc[0]))

    #print most popular trip; the combined label lives in a local Series so the
    #caller's DataFrame does not gain an extra column
    trips = df['Start Station'] + ' to ' + df['End Station']
    print('Most popular trip: ' + str(trips.mode().iloc[0]))

    print('-'*40)

def _format_duration(total_seconds):
    """Render a number of seconds as '[H hours, ][M minutes, ]S seconds'.

    Hours are shown only when non-zero; minutes are shown whenever hours or
    minutes are non-zero, so no component is skipped in the middle. Seconds
    are rounded to two decimals and printed without trailing zeros.
    """
    # Round first so a value like 59.999 becomes one full minute rather than
    # being displayed as "60 seconds".
    whole_minutes, seconds = divmod(round(float(total_seconds), 2), 60)
    hours, minutes = divmod(int(whole_minutes), 60)

    parts = []
    if hours > 0:
        parts.append('{} hours'.format(hours))
    if hours > 0 or minutes > 0:
        parts.append('{} minutes'.format(minutes))
    parts.append('{:g} seconds'.format(round(seconds, 2)))
    return ', '.join(parts)


def trip_duration_stats(df):
    """Print the total and the average of the 'Trip Duration' column.

    The values are treated as seconds and rendered as hours / minutes /
    seconds. Missing values are ignored; when no duration is available a short
    notice is printed instead of the statistics.
    """
    print('-'*40)
    print('Trip Duration stats coming up')
    print('-'*20)

    durations = df['Trip Duration'].dropna()
    if durations.empty:
        # The mean of no values is NaN, which cannot be formatted as a duration.
        print('No trips match the selected filters, so there are no trip duration stats to show')
        print('-'*40)
        return

    #print Total Travel time
    print('Total travel time was: ' + _format_duration(durations.sum()))

    #print average Travel time
    print('Average travel time was: ' + _format_duration(durations.mean()))

    print('-'*40)

def _format_year(value):
    """Render a birth-year aggregate: whole numbers without '.0', NaN as 'n/a'."""
    if pd.isna(value):
        return 'n/a'
    try:
        number = float(value)
    except (TypeError, ValueError):
        return str(value)
    return str(int(number)) if number.is_integer() else str(value)


def _most_common(values):
    """Return the most frequent non-missing value of a Series, or NaN if none."""
    modes = values.mode()
    return modes.iloc[0] if not modes.empty else float('nan')


def _print_gender_table(header, per_gender, render=str):
    """Print one 'gender ---> value' line per entry of ``per_gender``.

    Nothing is printed when the Series is empty. ``render`` converts each
    value to the text shown after the arrow.
    """
    if per_gender.size == 0:
        return
    print('-'*20)
    print(header)
    print('-'*20)
    for gender, value in per_gender.items():
        print(str(gender) + ' ---> ' + render(value))
    print('-'*20)


def user_stats(df, city):
    """Print user-type counts and, for Chicago and NYC, per-gender statistics.

    Args:
        df: Trip data with a 'User Type' column. For city codes 1 and 3 the
            'Gender' and 'Birth Year' columns are read as well.
        city: City code. Codes 1 (Chicago) and 3 (New York City) enable the
            gender / birth-year sections.

    Before each gender / birth-year section the user is asked, via
    ``exit_app()``, whether to continue; declining returns immediately.
    """
    print('-'*40)
    print('User stats coming up')
    print('-'*20)

    #Counts of each user type
    user_type_counts = df.groupby('User Type')['User Type'].count()
    if user_type_counts.size > 0:
        print('User Type ---> Number of Users')
        # Iterate label/value pairs; integer keys on a label-indexed Series
        # are not reliable positional lookups.
        for user_type, count in user_type_counts.items():
            print(str(user_type) + ' ---> ' + str(count))
        print('-'*20)

    #Following are applicable for Chicago and NYC
    if city not in (1, 3):
        return

    if exit_app():
        return

    #Group By Gender
    by_gender = df.groupby('Gender')

    #Counts of each gender
    _print_gender_table('Gender ---> Number of', by_gender['Gender'].count())

    # Remaining sections: (table header, aggregation applied to 'Birth Year').
    birth_year_sections = (
        ('Gender ---> Earliest Birth Year', 'min'),
        ('Gender ---> Most Recent Birth Year', 'max'),
        ('Gender ---> Most Common Birth Year', _most_common),
    )
    for header, aggregate in birth_year_sections:
        if exit_app():
            return
        _print_gender_table(header, by_gender['Birth Year'].agg(aggregate), _format_year)

def exit_app():
    """Ask whether to keep going; return True when the user wants to stop.

    Only the replies 'yes' and 'y' (case-insensitive) mean "continue" and
    yield False. Any other reply yields True.
    """
    reply = input('Do you want to see stats or do you want to exit the application? Enter Yes or y to keep going: ').lower()
    return reply not in ('yes', 'y')

def main():
    """Run the interactive session: pick filters, load data, show each report.

    Each report is preceded by an ``exit_app()`` prompt; declining ends the
    session. After the last report the user may start over with new filters.
    """
    running = True
    while running:
        # Get user input on how the data should be processed
        city, month, day = get_datafilters()

        print('Crunching data for the following: \nCity: {}'.format(city_name.get(city)))
        if month > -1:
            print('Month: {}'.format(calendar.month_name[month]))
        if day > -1:
            print('Day: {}'.format(calendar.day_name[day]))

        df = load_data(city, month, day)

        # Raw Data
        raw_data(df)

        # Reports in display order, each paired with its call arguments:
        # popular times, popular stations/trip, trip duration, user info.
        reports = (
            (time_stats, (df,)),
            (station_stats, (df,)),
            (trip_duration_stats, (df,)),
            (user_stats, (df, city)),
        )
        for report, arguments in reports:
            if exit_app():
                running = False
                break
            report(*arguments)
        else:
            # Reached only when no report prompt ended the session.
            restart = input('\nWe have computed all the statistics. Would you like to start again or quit the application? Enter yes(y) or no(n) or pretty much anything other than y).\n')
            running = restart.lower() in ('yes', 'y')

    print('\nThank you for trying out the application. Hope to see you again\n')

# Start the interactive session only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()