# urbain_traffic_sao_paulo.py

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 24 19:49:36 2022

@author: tarcisio
"""
import matplotlib.pyplot as plt
import pandas as pd
# Load the dataset. Fields in this CSV are separated by ';' rather than ','.
traffic = pd.read_csv('/Users/tarcisio/Documents/Projects_python/behavior_urban_Traffic/Behavior of the urban traffic of the city of Sao Paulo in Brazil/Behavior of the urban traffic of the city of Sao Paulo in Brazil.csv', sep=';')

# Quick look: first rows, last rows, then the column/dtype overview.
print(traffic.head(5))
print(traffic.tail(5))
traffic.info()  # info() writes its report itself, so no print() is needed

# Summary of the slowness column *before* cleaning. The values are still text
# at this point (the cleaning step below relies on the .str accessor), so this
# is the summary of a text column. A bare expression is only echoed in an
# interactive console; print() makes it visible when run as a script too.
print(traffic['Slowness in traffic (%)'].describe())

# Clean the data: the column uses ',' as the decimal mark. Swap it for '.'
# (regex=False: the comma is a literal character, not a pattern) and convert
# the resulting text to float.
traffic['Slowness in traffic (%)'] = (
    traffic['Slowness in traffic (%)']
    .str.replace(',', '.', regex=False)
    .astype(float)
)

# Histogram of the slowness column, drawn with matplotlib directly.
slowness_label = 'Slowness in traffic (%)'
plt.hist(traffic[slowness_label])
plt.show()


# The same histogram through the pandas plotting interface
# (equivalent to Series.plot.hist()), this time with an x-label and a title.
traffic[slowness_label].plot(kind='hist')
plt.xlabel(slowness_label)
plt.title('Distribution of Slowness in traffic (%)')
plt.show()
# Note: the y-label "Frequency" is generated by default.


# Calculate the sum of every incident column so the incident types can be
# compared. The incident columns are isolated by dropping the two columns
# that are not incidents: 'Hour (Coded)' and 'Slowness in traffic (%)'.
incidents = traffic.drop(columns=['Hour (Coded)', 'Slowness in traffic (%)'])

# Sum once and reuse the result. The values are printed explicitly because a
# bare expression produces no output when the file is run as a script.
incident_totals = incidents.sum()
print(incident_totals)
print(type(incident_totals))

# The type printed above shows which plotting accessor applies; the totals
# are drawn as a horizontal bar chart with .plot.barh().
incident_totals.plot.barh()
plt.show()


# Scatter plot of slowness against 'Lack of electricity'.
# Reminder: on a DataFrame the scatter chart is reached through `.plot`
# (DataFrame.plot.scatter(...) or, as here, DataFrame.plot(kind='scatter', ...)).
traffic.plot(kind='scatter',
             x='Slowness in traffic (%)',
             y='Lack of electricity')
plt.show()

#

# Keep only the rows whose slowness value is 20 or more.
is_slow = traffic["Slowness in traffic (%)"] >= 20
slowness_20_or_more = traffic[is_slow]

# Drop the two non-incident columns, then total each remaining column.
incidents_20_or_more = slowness_20_or_more.drop(
    columns=['Hour (Coded)', 'Slowness in traffic (%)'])
incident_frequencies = incidents_20_or_more.sum()

# Reminder: the bar chart is reached through the `.plot` interface here too.
incident_frequencies.plot(kind='barh')
plt.show()


# Split the table into one slice per weekday, then plot the five days.
# The split takes consecutive runs of 27 rows (rows 0-26, 27-53, ... up to
# row 134) and pairs each run with the next name in `days`.
# NOTE(review): this assumes the rows are ordered Monday..Friday with exactly
# 27 rows per day -- confirm against the dataset.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday']
rows_per_day = 27
traffic_per_day = {
    day: traffic[n * rows_per_day:(n + 1) * rows_per_day]
    for n, day in enumerate(days)
}

# One separate line chart per day, all with the same y-range so they can be
# compared by eye.
for day in days:
    traffic_per_day[day].plot.line(x='Hour (Coded)',
                                   y='Slowness in traffic (%)')
    plt.title(day)
    plt.ylim([0, 25])
    plt.show()

# Second variant: all five days on one chart. plt.plot is used here because it
# draws onto the current axes, so successive calls stack on the same chart
# (the original author's note: DataFrame.plot does not put the lines in the
# same place). The per-day slices built above are reused as-is.
for day in days:
    day_frame = traffic_per_day[day]
    plt.plot(day_frame['Hour (Coded)'],
             day_frame['Slowness in traffic (%)'],
             label=day)
plt.title('Difference by day')
plt.legend()
plt.show()

# Grid chart: a 3x2 grid of subplots on a single 10x12 figure.
# Panels 1-5 hold one weekday each; panel 6 overlays all of them.
plt.figure(figsize=(10, 12))
for panel, day in enumerate(days, start=1):
    plt.subplot(3, 2, panel)
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'])
    plt.title(day)
    plt.ylim([0, 25])
# Calling plt.show() at this point would render the grid with only the
# per-day panels.

# Add one more chart to the grid: every day overlaid in the last panel.
plt.subplot(3, 2, 6)
for day in days:
    plt.plot(traffic_per_day[day]['Hour (Coded)'],
             traffic_per_day[day]['Slowness in traffic (%)'],
             label=day)
# The y-range is the same for every line, so it is set once after the loop.
plt.ylim([0, 25])

plt.legend()
plt.show()