From a9f878db9c8a272ed614b99d5451ea7e54165876 Mon Sep 17 00:00:00 2001 From: Ben Date: Wed, 14 Apr 2021 20:29:04 -0400 Subject: [PATCH 1/2] Added an adapter class that cleans and transforms the data from github --- src/covidify/sources/data_adapter.py | 146 +++++++++++++++ .../sources/data_sources_interface.py | 10 ++ src/covidify/sources/github.py | 169 +++--------------- 3 files changed, 180 insertions(+), 145 deletions(-) create mode 100644 src/covidify/sources/data_adapter.py create mode 100644 src/covidify/sources/data_sources_interface.py diff --git a/src/covidify/sources/data_adapter.py b/src/covidify/sources/data_adapter.py new file mode 100644 index 0000000..bb36a01 --- /dev/null +++ b/src/covidify/sources/data_adapter.py @@ -0,0 +1,146 @@ +#!/usr/bin/env python3 +from __future__ import print_function +import pandas as pd +import re +import os +import sys +import git +import numpy as np +from tqdm import tqdm +from time import strftime +from dateutil.parser import parse +from datetime import datetime, date, time +from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA, KEEP_COLS, NUMERIC_COLS +from data_sources_interface import DataInterface +from github import Github + +class DataAdapter(DataInterface): + + def __init__(self): + pass + + def clean_sheet_names(new_ranges): + # Remove all sheets that dont have a numeric header + return [x for x in new_ranges if re.search(r'\d', x)] + + def clone_repo(TMP_FOLDER, REPO): + print('Cloning Data Repo...') + git.Git(TMP_FOLDER).clone(REPO) + + def get_date(last_update): + return parse(str(last_update).split(' ')[0]).strftime("%Y-%m-%d") + + def get_csv_date(f): + return get_date(f.split('.')[0] + ' ') + + + def fix_country_names(tmp_df): + ''' + Cleaning up after JHU's bullshit data management + ''' + # Asian Countries + tmp_df['country'] = np.where((tmp_df['country'] == 'Mainland China'),'China', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Korea, South'),'South Korea', 
tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of Korea'),'South Korea', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Hong Kong SAR'),'Hong Kong', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Taipei and environs'),'Taiwan', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Taiwan*'),'Taiwan', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Macao SAR'),'Macau', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Iran (Islamic Republic of)'),'Iran', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Viet Nam'),'Vietnam', tmp_df['country']) + + #European Countries + tmp_df['country'] = np.where((tmp_df['country'] == 'UK'),'United Kingdom', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == ' Azerbaijan'),'Azerbaijan', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Bosnia and Herzegovina'),'Bosnia', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Czech Republic'),'Czechia', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of Ireland'),'Ireland', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'North Ireland'),'Ireland', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of Moldova'),'Moldova', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Russian Federation'),'Russia', tmp_df['country']) + + #African Countries + tmp_df['country'] = np.where((tmp_df['country'] == 'Congo (Brazzaville)'),'Congo', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Congo (Kinshasa)'),'Congo', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of the Congo'),'Congo', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Gambia, The'),'Gambia', 
tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'The Gambia'),'Gambia', tmp_df['country']) + + # Western Countries + tmp_df['country'] = np.where((tmp_df['country'] == 'USA'),'America', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'US'),'America', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Bahamas, The'),'The Bahamas', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'Bahamas'),'The Bahamas', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'st. Martin'),'Saint Martin', tmp_df['country']) + tmp_df['country'] = np.where((tmp_df['country'] == 'St. Martin'),'Saint Martin', tmp_df['country']) + + + # Others + tmp_df['country'] = np.where((tmp_df['country'] == 'Cruise Ship'),'Others', tmp_df['country']) + + return tmp_df + + # Now that we have all the data we now need to clean it + # - Fill null values + # - remore suspected values + # - change column names + def clean_data(df): + tmp_df = df.copy() + + if 'Demised' in tmp_df.columns: + tmp_df.rename(columns={'Demised':'deaths'}, inplace=True) + + if 'Country/Region' in tmp_df.columns: + tmp_df.rename(columns={'Country/Region':'country'}, inplace=True) + + if 'Country_Region' in tmp_df.columns: + tmp_df.rename(columns={'Country_Region':'country'}, inplace=True) + + if 'Province/State' in tmp_df.columns: + tmp_df.rename(columns={'Province/State':'province'}, inplace=True) + + if 'Province_State' in tmp_df.columns: + tmp_df.rename(columns={'Province_State':'province'}, inplace=True) + + if 'Last Update' in tmp_df.columns: + tmp_df.rename(columns={'Last Update':'datetime'}, inplace=True) + + if 'Last_Update' in tmp_df.columns: + tmp_df.rename(columns={'Last_Update':'datetime'}, inplace=True) + + #Lower case all col names + tmp_df.columns = map(str.lower, tmp_df.columns) + + for col in tmp_df[NUMERIC_COLS]: + tmp_df[col] = tmp_df[col].fillna(0) + tmp_df[col] = tmp_df[col].astype(int) + + return 
tmp_df + + def get_data(cleaned_sheets): + all_csv = [] + # Import all CSV's + for f in tqdm(sorted(cleaned_sheets), desc='... loading data: '): + if 'csv' in f: + try: + tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None,header=0, parse_dates=['Last Update']) + except: + # Temporary fix for JHU's bullshit data management + tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None,header=0, parse_dates=['Last_Update']) + + tmp_df = clean_data(tmp_df) + tmp_df['date'] = tmp_df['datetime'].apply(get_date) # remove time to get date + tmp_df['file_date'] = get_csv_date(f) #Get date of csv from file name + tmp_df = tmp_df[KEEP_COLS] + tmp_df['province'].fillna(tmp_df['country'], inplace=True) #If no region given, fill it with country + all_csv.append(tmp_df) + + df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True) # concatenate all csv's into one df + df_raw = fix_country_names(df_raw) # Fix mispelled country names + df_raw = df_raw.sort_values(by=['datetime']) + return df_raw + + def get(self): + github = Github() + cleaned_sheets = clean_sheet_names(github.get()) + df = get_data(cleaned_sheets) diff --git a/src/covidify/sources/data_sources_interface.py b/src/covidify/sources/data_sources_interface.py new file mode 100644 index 0000000..5f14c47 --- /dev/null +++ b/src/covidify/sources/data_sources_interface.py @@ -0,0 +1,10 @@ +import abc + +class DataInterface(object, metaclass=abc.ABCMeta): + + def __init__(self): + pass + + @abc.abstractmethod + def get(): + raise NotImplementedError('User must define get()') diff --git a/src/covidify/sources/github.py b/src/covidify/sources/github.py index 38c7a71..03f0bb8 100644 --- a/src/covidify/sources/github.py +++ b/src/covidify/sources/github.py @@ -9,161 +9,40 @@ from tqdm import tqdm from time import strftime from dateutil.parser import parse -from datetime import datetime, date, time +from datetime import datetime, date, time from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA, KEEP_COLS, 
NUMERIC_COLS +from data_sources_interface import DataInterface -def clean_sheet_names(new_ranges): - # Remove all sheets that dont have a numeric header - return [x for x in new_ranges if re.search(r'\d', x)] +class Github(DataInterface): -def clone_repo(TMP_FOLDER, REPO): - print('Cloning Data Repo...') - git.Git(TMP_FOLDER).clone(REPO) + data_sheet = None -def get_date(last_update): - return parse(str(last_update).split(' ')[0]).strftime("%Y-%m-%d") + def __init__(self): + # Create Tmp Folder + if not os.path.isdir(TMP_FOLDER): + print('Creating folder...') + print('...', TMP_FOLDER) + os.mkdir(TMP_FOLDER) -def get_csv_date(f): - return get_date(f.split('.')[0] + ' ') - - -def fix_country_names(tmp_df): - ''' - Cleaning up after JHU's bullshit data management - ''' - # Asian Countries - tmp_df['country'] = np.where((tmp_df['country'] == 'Mainland China'),'China', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Korea, South'),'South Korea', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of Korea'),'South Korea', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Hong Kong SAR'),'Hong Kong', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Taipei and environs'),'Taiwan', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Taiwan*'),'Taiwan', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Macao SAR'),'Macau', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Iran (Islamic Republic of)'),'Iran', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Viet Nam'),'Vietnam', tmp_df['country']) - - #European Countries - tmp_df['country'] = np.where((tmp_df['country'] == 'UK'),'United Kingdom', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == ' Azerbaijan'),'Azerbaijan', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Bosnia and 
Herzegovina'),'Bosnia', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Czech Republic'),'Czechia', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of Ireland'),'Ireland', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'North Ireland'),'Ireland', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of Moldova'),'Moldova', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Russian Federation'),'Russia', tmp_df['country']) - - #African Countries - tmp_df['country'] = np.where((tmp_df['country'] == 'Congo (Brazzaville)'),'Congo', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Congo (Kinshasa)'),'Congo', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Republic of the Congo'),'Congo', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Gambia, The'),'Gambia', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'The Gambia'),'Gambia', tmp_df['country']) - - # Western Countries - tmp_df['country'] = np.where((tmp_df['country'] == 'USA'),'America', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'US'),'America', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Bahamas, The'),'The Bahamas', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'Bahamas'),'The Bahamas', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'st. Martin'),'Saint Martin', tmp_df['country']) - tmp_df['country'] = np.where((tmp_df['country'] == 'St. 
Martin'),'Saint Martin', tmp_df['country']) - - - # Others - tmp_df['country'] = np.where((tmp_df['country'] == 'Cruise Ship'),'Others', tmp_df['country']) - - return tmp_df - -# Now that we have all the data we now need to clean it -# - Fill null values -# - remore suspected values -# - change column names -def clean_data(df): - tmp_df = df.copy() - - if 'Demised' in tmp_df.columns: - tmp_df.rename(columns={'Demised':'deaths'}, inplace=True) - - if 'Country/Region' in tmp_df.columns: - tmp_df.rename(columns={'Country/Region':'country'}, inplace=True) - - if 'Country_Region' in tmp_df.columns: - tmp_df.rename(columns={'Country_Region':'country'}, inplace=True) - - if 'Province/State' in tmp_df.columns: - tmp_df.rename(columns={'Province/State':'province'}, inplace=True) - - if 'Province_State' in tmp_df.columns: - tmp_df.rename(columns={'Province_State':'province'}, inplace=True) - - if 'Last Update' in tmp_df.columns: - tmp_df.rename(columns={'Last Update':'datetime'}, inplace=True) - - if 'Last_Update' in tmp_df.columns: - tmp_df.rename(columns={'Last_Update':'datetime'}, inplace=True) - - #Lower case all col names - tmp_df.columns = map(str.lower, tmp_df.columns) - - for col in tmp_df[NUMERIC_COLS]: - tmp_df[col] = tmp_df[col].fillna(0) - tmp_df[col] = tmp_df[col].astype(int) - - return tmp_df - -def get_data(cleaned_sheets): - all_csv = [] - # Import all CSV's - for f in tqdm(sorted(cleaned_sheets), desc='... 
loading data: '): - if 'csv' in f: + #Check if repo exists + #git pull if it does + if not os.path.isdir(TMP_GIT): + clone_repo(TMP_FOLDER, REPO) + else: try: - tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None,header=0, parse_dates=['Last Update']) + print('git pull from', REPO) + rep = git.Repo(TMP_GIT) + rep.remotes.origin.pull() except: - # Temporary fix for JHU's bullshit data management - tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None,header=0, parse_dates=['Last_Update']) - - tmp_df = clean_data(tmp_df) - tmp_df['date'] = tmp_df['datetime'].apply(get_date) # remove time to get date - tmp_df['file_date'] = get_csv_date(f) #Get date of csv from file name - tmp_df = tmp_df[KEEP_COLS] - tmp_df['province'].fillna(tmp_df['country'], inplace=True) #If no region given, fill it with country - all_csv.append(tmp_df) + print('Could not pull from', REPO) + sys.exit(1) - df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True) # concatenate all csv's into one df - df_raw = fix_country_names(df_raw) # Fix mispelled country names - df_raw = df_raw.sort_values(by=['datetime']) - return df_raw + data_sheet = os.listdir(DATA) -# use this function to fetch the data -def get(): - - # Create Tmp Folder - if not os.path.isdir(TMP_FOLDER): - print('Creating folder...') - print('...', TMP_FOLDER) - os.mkdir(TMP_FOLDER) - #Check if repo exists - #git pull if it does - if not os.path.isdir(TMP_GIT): - clone_repo(TMP_FOLDER, REPO) - else: - try: - print('git pull from', REPO) - rep = git.Repo(TMP_GIT) - rep.remotes.origin.pull() - except: - print('Could not pull from', REPO) - sys.exit(1) - sheets = os.listdir(DATA) - - # Clean the result to the sheet tabs we want - print('Getting sheets...') - cleaned_sheets = clean_sheet_names(sheets) + # use this function to fetch the data + def get(self): - # Aggregate all the data from sheets - df = get_data(cleaned_sheets) - - #Clean the column names - return df \ No newline at end of file + return self.data_sheet 
def columnize(countries, width=30):
    """Format names into rows of three left-justified columns.

    Uses itertools.zip_longest so the final one or two names are still shown
    when len(countries) is not a multiple of three -- the original zip()
    silently truncated them. `width` generalizes the hard-coded column width
    (default 30 preserves the original layout).
    """
    from itertools import zip_longest
    row_fmt = '{{:<{w}}}{{:<{w}}}{{:<}}'.format(w=width)
    triples = zip_longest(countries[::3], countries[1::3], countries[2::3],
                          fillvalue='')
    return [row_fmt.format(a, b, c) for a, b, c in triples]


def get_countries():
    """Print every country/area with at least one confirmed case, three per
    row, followed by the total count."""
    print('Getting available countries...')
    # Initialize an instance of the adapter and pull the cleaned dataframe.
    github_data = PandasDataAdapter()
    df = github_data.get()
    df = df[df.confirmed > 0]

    countries = sorted(set(df.country.values))
    for line in columnize(countries):
        print(line)

    print('\n\033[1;31mNUMBER OF COUNTRIES/AREAS INFECTED:\033[0;0m', len(countries))