Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implementing Adapter Design Pattern for Github Data #90

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 22 additions & 18 deletions src/covidify/data_prep.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,14 @@
import pandas as pd
from string import capwords
from difflib import get_close_matches
from datetime import datetime, date, time
from datetime import datetime, date, time

from covidify.sources import github, wiki
from covidify.sources.data_adapter import PandasDataAdapter
from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA
from covidify.utils.utils import replace_arg_score



args = docopt.docopt(__doc__)
out = args['--output_folder']
country = args['--country']
Expand All @@ -34,6 +35,9 @@


############ DATA SELECTION ############
#Initialize an instance of the adapter
github_data = PandasDataAdapter()


if '_' in country:
country = replace_arg_score(country)
Expand All @@ -42,22 +46,22 @@
country = None

if source == 'JHU':
df = github.get()
df = github_data.get()

elif source == 'wiki':
print('Apologies, the wikipedia source is not ready yet - getting github data')
df = github.get()
df = github_data.get()



############ COUNTRY SELECTION ############

def get_similar_countries(c, country_list):
pos_countries = get_close_matches(c, country_list)

if len(pos_countries) > 0:
print('\033[1;31m'+c, 'was not listed. did you mean', pos_countries[0].capitalize() + '?\033[0;0m')

#Only delete if its a covidify generated folder
if 'Desktop/covidify-output-' in out:
os.system('rm -rf ' + out)
Expand All @@ -67,13 +71,13 @@ def get_similar_countries(c, country_list):
if 'Desktop/covidify-output-' in out:
os.system('rm -rf ' + out)
sys.exit(1)

def check_specified_country(df, country):
'''
let user filter reports by country, if not found
then give a option if the string is similar
'''

# Get all unique countries in the data
country_list = list(map(lambda x:x.lower().strip(), set(df.country.values)))

Expand All @@ -85,7 +89,7 @@ def check_specified_country(df, country):
# give similar option if similarity found
if country.lower() not in country_list:
get_similar_countries(country, country_list)

else:
#Return filtered dataframe
print('... filtering data for', country)
Expand Down Expand Up @@ -170,9 +174,9 @@ def get_top_countries(data):
# Get top N infected countries
tmp_df = data.copy()
tmp_df = tmp_df[tmp_df.file_date == df.file_date.max()]
return tmp_df.groupby(['country']).agg({'confirmed': 'sum'}).sort_values('confirmed',ascending=False).head(top).index
TOP_N_COUNTRIES = get_top_countries(df)
return tmp_df.groupby(['country']).agg({'confirmed': 'sum'}).sort_values('confirmed',ascending=False).head(top).index

TOP_N_COUNTRIES = get_top_countries(df)

tmp_df = df[df.country.isin(TOP_N_COUNTRIES)].copy()

Expand All @@ -188,18 +192,18 @@ def get_day_counts(d, country):
'deaths': 'sum'})
result_df['date'] = data['file_date'].unique()
result_df['country'] = country

result_df = result_df[result_df.confirmed >= 500]
result_df.insert(loc=0, column='day', value=np.arange(len(result_df)))
return result_df

df_list = []

for country in TOP_N_COUNTRIES:
print(' ...', country + ': ' + str(tmp_df[(tmp_df.file_date == df.file_date.max()) &
print(' ...', country + ': ' + str(tmp_df[(tmp_df.file_date == df.file_date.max()) &
(tmp_df.country == country)].confirmed.sum()))
df_list.append(get_day_counts(tmp_df[tmp_df.country == country], country))

log_df = pd.concat(df_list, axis=0, ignore_index=True)


Expand Down Expand Up @@ -227,4 +231,4 @@ def get_day_counts(d, country):
log_df.astype(str).to_csv(os.path.join(save_dir, log_file_name))
print('...', log_file_name)

print('Done!')
print('Done!')
12 changes: 7 additions & 5 deletions src/covidify/list_countries.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
'''
This script is for listing countries that have cases of corona virus.
This is so you can decide which country to make a report for.
This is so you can decide which country to make a report for.

'''

Expand All @@ -9,17 +9,19 @@
import click
import covidify
import numpy as np
from covidify.sources import github
from covidify.sources.data_adapter import PandasDataAdapter
from covidify.config import SCRIPT

def get_countries():
print('Getting available countries...')
df = github.get()
#Initialize an instance of the adapter
github_data = PandasDataAdapter()
df = github_data.get()
df = df[df.confirmed > 0]

countries = sorted(list(set(df.country.values)))

for a,b,c in zip(countries[::3],countries[1::3],countries[2::3]):
print('{:<30}{:<30}{:<}'.format(a,b,c))
print('\n\033[1;31mNUMBER OF COUNTRIES/AREAS INFECTED:\033[0;0m', len(countries))

print('\n\033[1;31mNUMBER OF COUNTRIES/AREAS INFECTED:\033[0;0m', len(countries))
150 changes: 150 additions & 0 deletions src/covidify/sources/data_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,150 @@
#!/usr/bin/env python3
from __future__ import print_function

# stdlib
import os
import re
import sys
from datetime import datetime, date, time
from time import strftime

# third-party
import git
import numpy as np
import pandas as pd
from dateutil.parser import parse
from tqdm import tqdm
from github import Github

# project
from covidify.config import REPO, TMP_FOLDER, TMP_GIT, DATA, KEEP_COLS, NUMERIC_COLS
# FIX: import the interface through its package path. The original bare
# `from data_sources_interface import ...` only resolves when the current
# working directory happens to be covidify/sources/, and fails with
# ModuleNotFoundError when this module is imported as
# covidify.sources.data_adapter (which is how data_prep.py imports it).
from covidify.sources.data_sources_interface import DataInterface

class PandasDataAdapter(DataInterface):
    '''
    Adapter that implements DataInterface.get() for the JHU github CSV
    daily reports: clones the data repo, loads every dated CSV into
    pandas, normalises columns and country names, and returns one
    cleaned, datetime-sorted dataframe.

    FIX vs. original: the helpers were defined inside the class without
    `self` or `@staticmethod` but were called as bare names (e.g.
    `get_data` calling `clean_data(...)`), which raises NameError at
    runtime; and `get()` instantiated PyGithub's `Github()` and called a
    non-existent `.get()` instead of running the clone/load pipeline the
    helpers implement.
    '''

    def __init__(self):
        pass

    @staticmethod
    def clean_sheet_names(new_ranges):
        '''Keep only sheet/file names containing a digit (the dated CSVs).'''
        return [x for x in new_ranges if re.search(r'\d', x)]

    @staticmethod
    def clone_repo(tmp_folder, repo):
        '''Clone the data repository `repo` into `tmp_folder`.'''
        # Lower-case parameter names: the originals shadowed the
        # TMP_FOLDER/REPO module constants.
        print('Cloning Data Repo...')
        git.Git(tmp_folder).clone(repo)

    @staticmethod
    def get_date(last_update):
        '''Normalise a timestamp-like value to a 'YYYY-MM-DD' string.'''
        return parse(str(last_update).split(' ')[0]).strftime("%Y-%m-%d")

    @staticmethod
    def get_csv_date(f):
        '''Extract the date encoded in a CSV file name (e.g. 01-22-2020.csv).'''
        return PandasDataAdapter.get_date(f.split('.')[0] + ' ')

    @staticmethod
    def fix_country_names(tmp_df):
        '''
        Normalise inconsistent country naming across the JHU sheets.

        Exact-match replacement table; same result as the original chain
        of np.where calls, one pass instead of ~30.
        '''
        corrections = {
            # Asian countries
            'Mainland China': 'China',
            'Korea, South': 'South Korea',
            'Republic of Korea': 'South Korea',
            'Hong Kong SAR': 'Hong Kong',
            'Taipei and environs': 'Taiwan',
            'Taiwan*': 'Taiwan',
            'Macao SAR': 'Macau',
            'Iran (Islamic Republic of)': 'Iran',
            'Viet Nam': 'Vietnam',
            # European countries (note the stray leading space in the data)
            'UK': 'United Kingdom',
            ' Azerbaijan': 'Azerbaijan',
            'Bosnia and Herzegovina': 'Bosnia',
            'Czech Republic': 'Czechia',
            'Republic of Ireland': 'Ireland',
            'North Ireland': 'Ireland',
            'Republic of Moldova': 'Moldova',
            'Russian Federation': 'Russia',
            # African countries
            'Congo (Brazzaville)': 'Congo',
            'Congo (Kinshasa)': 'Congo',
            'Republic of the Congo': 'Congo',
            'Gambia, The': 'Gambia',
            'The Gambia': 'Gambia',
            # Western countries
            'USA': 'America',
            'US': 'America',
            'Bahamas, The': 'The Bahamas',
            'Bahamas': 'The Bahamas',
            'st. Martin': 'Saint Martin',
            'St. Martin': 'Saint Martin',
            # Others
            'Cruise Ship': 'Others',
        }
        tmp_df['country'] = tmp_df['country'].replace(corrections)
        return tmp_df

    @staticmethod
    def clean_data(df):
        '''
        Normalise one raw daily-report dataframe:
        - unify column names across the several JHU schemas,
        - lower-case all column names,
        - fill and int-cast the NUMERIC_COLS.
        Returns a cleaned copy; the input is not mutated.
        '''
        tmp_df = df.copy()

        # The JHU sheets changed header spelling over time; map every
        # known variant onto one canonical name.
        rename_map = {
            'Demised': 'deaths',
            'Country/Region': 'country',
            'Country_Region': 'country',
            'Province/State': 'province',
            'Province_State': 'province',
            'Last Update': 'datetime',
            'Last_Update': 'datetime',
        }
        present = {k: v for k, v in rename_map.items() if k in tmp_df.columns}
        tmp_df.rename(columns=present, inplace=True)

        # Lower-case all column names
        tmp_df.columns = map(str.lower, tmp_df.columns)

        for col in tmp_df[NUMERIC_COLS]:
            tmp_df[col] = tmp_df[col].fillna(0)
            tmp_df[col] = tmp_df[col].astype(int)

        return tmp_df

    @staticmethod
    def get_data(cleaned_sheets):
        '''Load, clean and concatenate every CSV in `cleaned_sheets`.'''
        all_csv = []
        for f in tqdm(sorted(cleaned_sheets), desc='... loading data: '):
            if 'csv' not in f:
                continue
            try:
                tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None,
                                     header=0, parse_dates=['Last Update'])
            except Exception:
                # Newer JHU sheets renamed the timestamp column.
                tmp_df = pd.read_csv(os.path.join(DATA, f), index_col=None,
                                     header=0, parse_dates=['Last_Update'])

            tmp_df = PandasDataAdapter.clean_data(tmp_df)
            # Strip the time component to get a plain date column.
            tmp_df['date'] = tmp_df['datetime'].apply(PandasDataAdapter.get_date)
            # The file name encodes the reporting date of the sheet.
            tmp_df['file_date'] = PandasDataAdapter.get_csv_date(f)
            tmp_df = tmp_df[KEEP_COLS]
            # If no region given, fall back to the country name.
            tmp_df['province'].fillna(tmp_df['country'], inplace=True)
            all_csv.append(tmp_df)

        df_raw = pd.concat(all_csv, axis=0, ignore_index=True, sort=True)
        df_raw = PandasDataAdapter.fix_country_names(df_raw)
        df_raw = df_raw.sort_values(by=['datetime'])
        return df_raw

    def get(self):
        '''
        DataInterface entry point: clone the data repo, discover the
        daily-report CSVs under DATA, and return the cleaned dataframe.
        '''
        # FIX: the original built PyGithub's Github() and called a
        # non-existent .get(); the pipeline the helpers implement is
        # clone -> list sheets -> filter dated names -> load.
        self.clone_repo(TMP_FOLDER, REPO)
        sheets = os.listdir(DATA)  # raw file names in the cloned repo
        cleaned_sheets = self.clean_sheet_names(sheets)
        return self.get_data(cleaned_sheets)
10 changes: 10 additions & 0 deletions src/covidify/sources/data_sources_interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import abc

class DataInterface(abc.ABC):
    '''
    Abstract contract for covidify data sources.

    Implementations (e.g. PandasDataAdapter) must provide get(), which
    fetches the source's data and returns it as a dataframe.
    '''

    @abc.abstractmethod
    def get(self):
        '''Fetch and return the source's data.

        Raises:
            NotImplementedError: if a subclass fails to override.
        '''
        # FIX: the original declared `def get():` with no `self`,
        # mismatching every implementor's `get(self)` signature.
        raise NotImplementedError('User must define get()')
Loading