Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing Adapter Design Pattern for Github Data #90

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 22 additions & 18 deletions src/covidify/data_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
import pandas as pd
from string import capwords
from difflib import get_close_matches
from datetime import datetime, date, time
from datetime import datetime, date, time

from covidify.sources import github, wiki
from covidify.sources.data_adapter import PandasDataAdapter
from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA
from covidify.utils.utils import replace_arg_score



args = docopt.docopt(__doc__)
out = args['--output_folder']
country = args['--country']
Expand All @@ -34,6 +35,9 @@


############ DATA SELECTION ############
#Initialize an instance of the adapter
github_data = PandasDataAdapter()


if '_' in country:
country = replace_arg_score(country)
Expand All @@ -42,22 +46,22 @@
country = None

if source == 'JHU':
df = github.get()
df = github_data.get()

elif source == 'wiki':
print('Apologies, the wikipedia source is not ready yet - getting github data')
df = github.get()
df = github_data.get()



############ COUNTRY SELECTION ############

def get_similar_countries(c, country_list):
pos_countries = get_close_matches(c, country_list)

if len(pos_countries) > 0:
print('\033[1;31m'+c, 'was not listed. did you mean', pos_countries[0].capitalize() + '?\033[0;0m')

#Only delete if its a covidify generated folder
if 'Desktop/covidify-output-' in out:
os.system('rm -rf ' + out)
Expand All @@ -67,13 +71,13 @@ def get_similar_countries(c, country_list):
if 'Desktop/covidify-output-' in out:
os.system('rm -rf ' + out)
sys.exit(1)

def check_specified_country(df, country):
'''
let user filter reports by country, if not found
then give a option if the string is similar
'''

# Get all unique countries in the data
country_list = list(map(lambda x:x.lower().strip(), set(df.country.values)))

Expand All @@ -85,7 +89,7 @@ def check_specified_country(df, country):
# give similar option if similarity found
if country.lower() not in country_list:
get_similar_countries(country, country_list)

else:
#Return filtered dataframe
print('... filtering data for', country)
Expand Down Expand Up @@ -170,9 +174,9 @@ def get_top_countries(data):
# Get top N infected countries
tmp_df = data.copy()
tmp_df = tmp_df[tmp_df.file_date == df.file_date.max()]
return tmp_df.groupby(['country']).agg({'confirmed': 'sum'}).sort_values('confirmed',ascending=False).head(top).index
TOP_N_COUNTRIES = get_top_countries(df)
return tmp_df.groupby(['country']).agg({'confirmed': 'sum'}).sort_values('confirmed',ascending=False).head(top).index

TOP_N_COUNTRIES = get_top_countries(df)

tmp_df = df[df.country.isin(TOP_N_COUNTRIES)].copy()

Expand All @@ -188,18 +192,18 @@ def get_day_counts(d, country):
'deaths': 'sum'})
result_df['date'] = data['file_date'].unique()
result_df['country'] = country

result_df = result_df[result_df.confirmed >= 500]
result_df.insert(loc=0, column='day', value=np.arange(len(result_df)))
return result_df

df_list = []

for country in TOP_N_COUNTRIES:
print(' ...', country + ': ' + str(tmp_df[(tmp_df.file_date == df.file_date.max()) &
print(' ...', country + ': ' + str(tmp_df[(tmp_df.file_date == df.file_date.max()) &
(tmp_df.country == country)].confirmed.sum()))
df_list.append(get_day_counts(tmp_df[tmp_df.country == country], country))

log_df = pd.concat(df_list, axis=0, ignore_index=True)


Expand Down Expand Up @@ -227,4 +231,4 @@ def get_day_counts(d, country):
log_df.astype(str).to_csv(os.path.join(save_dir, log_file_name))
print('...', log_file_name)

print('Done!')
print('Done!')
12 changes: 7 additions & 5 deletions src/covidify/list_countries.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
'''
This script is for listing countries that have cases of corona virus.
This is so you can decide which country to make a report for.
This is so you can decide which country to make a report for.

'''

Expand All @@ -9,17 +9,19 @@
import click
import covidify
import numpy as np
from covidify.sources import github
from covidify.sources.data_adapter import PandasDataAdapter
from covidify.config import SCRIPT

def get_countries():
print('Getting available countries...')
df = github.get()
#Initialize an instance of the adapter
github_data = PandasDataAdapter()
df = github_data.get()
df = df[df.confirmed > 0]

countries = sorted(list(set(df.country.values)))

for a,b,c in zip(countries[::3],countries[1::3],countries[2::3]):
print('{:<30}{:<30}{:<}'.format(a,b,c))
print('\n\033[1;31mNUMBER OF COUNTRIES/AREAS INFECTED:\033[0;0m', len(countries))

print('\n\033[1;31mNUMBER OF COUNTRIES/AREAS INFECTED:\033[0;0m', len(countries))
150 changes: 150 additions & 0 deletions src/covidify/sources/data_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/usr/bin/env python3
from __future__ import print_function

# stdlib
import os
import re
import sys
from datetime import datetime, date, time
from time import strftime

# third-party
import git
import numpy as np
import pandas as pd
from dateutil.parser import parse
from tqdm import tqdm
from github import Github

# project
from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA, KEEP_COLS, NUMERIC_COLS
# FIX: import the interface through its package path. The original bare
# `from data_sources_interface import ...` only resolves when the current
# working directory happens to be covidify/sources/, and fails with
# ModuleNotFoundError when this module is imported as
# covidify.sources.data_adapter (which is how data_prep.py imports it).
from covidify.sources.data_sources_interface import DataInterface

class PandasDataAdapter(DataInterface):
    '''
    Adapter that implements DataInterface.get() for the JHU github CSV
    daily reports: clones the data repo, loads every dated CSV into
    pandas, normalises columns and country names, and returns one
    cleaned, datetime-sorted dataframe.

    FIX vs. original: the helpers were defined inside the class without
    `self` or `@staticmethod` but were called as bare names (e.g.
    `get_data` calling `clean_data(...)`), which raises NameError at
    runtime; and `get()` instantiated PyGithub's `Github()` and called a
    non-existent `.get()` instead of running the clone/load pipeline the
    helpers implement.
    '''

    def __init__(self):
        pass

    @staticmethod
    def clean_sheet_names(new_ranges):
        '''Keep only sheet/file names containing a digit (the dated CSVs).'''
        return [x for x in new_ranges if re.search(r'\d', x)]

    @staticmethod
    def clone_repo(tmp_folder, repo):
        '''Clone the data repository `repo` into `tmp_folder`.'''
        # Lower-case parameter names: the originals shadowed the
        # TMP_FOLDER/REPO module constants.
        print('Cloning Data Repo...')
        git.Git(tmp_folder).clone(repo)

    @staticmethod
    def get_date(last_update):
        '''Normalise a timestamp-like value to a 'YYYY-MM-DD' string.'''
        return parse(str(last_update).split(' ')[0]).strftime("%Y-%m-%d")

    @staticmethod
    def get_csv_date(f):
        '''Extract the date encoded in a CSV file name (e.g. 01-22-2020.csv).'''
        return PandasDataAdapter.get_date(f.split('.')[0] + ' ')

    @staticmethod
    def fix_country_names(tmp_df):
        '''
        Normalise inconsistent country naming across the JHU sheets.

        Exact-match replacement table; same result as the original chain
        of np.where calls, one pass instead of ~30.
        '''
        corrections = {
            # Asian countries
            'Mainland China': 'China',
            'Korea, South': 'South Korea',
            'Republic of Korea': 'South Korea',
            'Hong Kong SAR': 'Hong Kong',
            'Taipei and environs': 'Taiwan',
            'Taiwan*': 'Taiwan',
            'Macao SAR': 'Macau',
            'Iran (Islamic Republic of)': 'Iran',
            'Viet Nam': 'Vietnam',
            # European countries (note the stray leading space in the data)
            'UK': 'United Kingdom',
            ' Azerbaijan': 'Azerbaijan',
            'Bosnia and Herzegovina': 'Bosnia',
            'Czech Republic': 'Czechia',
            'Republic of Ireland': 'Ireland',
            'North Ireland': 'Ireland',
            'Republic of Moldova': 'Moldova',
            'Russian Federation': 'Russia',
            # African countries
            'Congo (Brazzaville)': 'Congo',
            'Congo (Kinshasa)': 'Congo',
            'Republic of the Congo': 'Congo',
            'Gambia, The': 'Gambia',
            'The Gambia': 'Gambia',
            # Western countries
            'USA': 'America',
            'US': 'America',
            'Bahamas, The': 'The Bahamas',
            'Bahamas': 'The Bahamas',
            'st. Martin': 'Saint Martin',
            'St. Martin': 'Saint Martin',
            # Others
            'Cruise Ship': 'Others',
        }
        tmp_df['country'] = tmp_df['country'].replace(corrections)
        return tmp_df

    @staticmethod
    def clean_data(df):
        '''
        Normalise one raw daily-report dataframe:
        - unify column names across the several JHU schemas,
        - lower-case all column names,
        - fill and int-cast the NUMERIC_COLS.
        Returns a cleaned copy; the input is not mutated.
        '''
        tmp_df = df.copy()

        # The JHU sheets changed header spelling over time; map every
        # known variant onto one canonical name.
        rename_map = {
            'Demised': 'deaths',
            'Country/Region': 'country',
            'Country_Region': 'country',
            'Province/State': 'province',
            'Province_State': 'province',
            'Last Update': 'datetime',
            'Last_Update': 'datetime',
        }
        present = {k: v for k, v in rename_map.items() if k in tmp_df.columns}
        tmp_df.rename(columns=present, inplace=True)

        # Lower-case all column names
        tmp_df.columns = map(str.lower, tmp_df.columns)

        for col in tmp_df[NUMERIC_COLS]:
            tmp_df[col] = tmp_df[col].fillna(0)
            tmp_df[col] = tmp_df[col].astype(int)

        return tmp_df

    @staticmethod
    def get_data(cleaned_sheets):
        '''Load, clean and concatenate every CSV in `cleaned_sheets`.'''
        all_csv = []
        for f in tqdm(sorted(cleaned_sheets), desc='... loading data: '):
            if 'csv' not in f:
                continue
            try:
                tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None,
                                     header=0, parse_dates=['Last Update'])
            except Exception:
                # Newer JHU sheets renamed the timestamp column.
                tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None,
                                     header=0, parse_dates=['Last_Update'])

            tmp_df = PandasDataAdapter.clean_data(tmp_df)
            # Strip the time component to get a plain date column.
            tmp_df['date'] = tmp_df['datetime'].apply(PandasDataAdapter.get_date)
            # The file name encodes the reporting date of the sheet.
            tmp_df['file_date'] = PandasDataAdapter.get_csv_date(f)
            tmp_df = tmp_df[KEEP_COLS]
            # If no region given, fall back to the country name.
            tmp_df['province'].fillna(tmp_df['country'], inplace=True)
            all_csv.append(tmp_df)

        df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True)
        df_raw = PandasDataAdapter.fix_country_names(df_raw)
        df_raw = df_raw.sort_values(by=['datetime'])
        return df_raw

    def get(self):
        '''
        DataInterface entry point: clone the data repo, discover the
        daily-report CSVs under DATA, and return the cleaned dataframe.
        '''
        # FIX: the original built PyGithub's Github() and called a
        # non-existent .get(); the pipeline the helpers implement is
        # clone -> list sheets -> filter dated names -> load.
        self.clone_repo(TMP_FOLDER, REPO)
        sheets = os.listdir(DATA)  # raw file names in the cloned repo
        cleaned_sheets = self.clean_sheet_names(sheets)
        return self.get_data(cleaned_sheets)
10 changes: 10 additions & 0 deletions src/covidify/sources/data_sources_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import abc

class DataInterface(abc.ABC):
    '''
    Abstract contract for covidify data sources.

    Implementations (e.g. PandasDataAdapter) must provide get(), which
    fetches the source's data and returns it as a dataframe.
    '''

    @abc.abstractmethod
    def get(self):
        '''Fetch and return the source's data.

        Raises:
            NotImplementedError: if a subclass fails to override.
        '''
        # FIX: the original declared `def get():` with no `self`,
        # mismatching every implementor's `get(self)` signature.
        raise NotImplementedError('User must define get()')
Loading