Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP] Add datasets from "Encoding high-cardinality string categorical variables" paper #4

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/beer_reviews.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,5 +35,7 @@ def get_beer_reviews_df(save=True):
elt = elt.replace('\xa0', ' ')
arr.append(elt)
df[c] = pd.Series(arr, dtype=df[c].dtype, index=df.index)
df.rename(columns={col: col.lower() for
col in df.columns}, inplace=True)
write_df(save, df, data_dir[1], BEER_REVIEWS_CONFIG.main_file)
return df
29 changes: 29 additions & 0 deletions src/building_permits.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import pandas as pd
import numpy as np
import kaggle
import re

def get_building_permits_df():
    """Download the Chicago building-permits dataset from Kaggle and return a cleaned DataFrame.

    The prediction target 'estimated_cost' is the log of the reported cost.
    """
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files('chicago/chicago-building-permits',
                                      path='data/building_permits/raw', unzip=True)
    # NOTE: the upstream Kaggle dataset is updated daily.

    csv_path = 'data/building_permits/raw/building-permits.csv'
    df = pd.read_csv(csv_path, low_memory=False)
    df.columns = df.columns.str.strip()
    df['PERMIT#'] = df['PERMIT#'].astype(str)
    # Zip codes may contain '-', so keep them as strings.
    for column in df.columns:
        if 'ZIPCODE' in column:
            df[column] = df[column].astype(str)
    # Target: log of reported cost; the epsilon avoids log(0).
    df['ESTIMATED_COST'] = np.log(df['REPORTED_COST'].astype(float) + 1E-10)
    df.columns = [name.lower() for name in df.columns]
    # Contact fields are free text; force them to str.
    for column in df:
        if 'contact_' in column:
            df[column] = df[column].astype(str)
    df['work_description'] = df['work_description'].astype('category')
    # Strip characters that are awkward in downstream column handling.
    df.columns = [re.sub('@|:|#', '', name) for name in df.columns]
    return df
26 changes: 26 additions & 0 deletions src/cacao_flavor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import re
import pandas as pd


def get_cacao_flavor_df():
    """Load the 'Flavors of Cacao' chocolate-bar ratings and return a cleaned DataFrame.

    Column names are normalized to snake_case; '&' and non-breaking spaces
    are cleaned out of the string columns; the single row with a missing
    'bean_type' label is dropped.
    """
    # !kaggle datasets download rtatman/chocolate-bar-ratings/ -p data/cacao_flavor/raw --unzip
    csv_path = 'data/cacao_flavor/raw/flavors_of_cacao.csv'
    df = pd.read_csv(csv_path)
    # Normalize column names: NBSP -> space, newline -> '_', space -> '_', lowercase.
    df.columns = [re.sub(' ', '_',
                         re.sub('\n', '_',
                                re.sub('\xa0', ' ', col))).lower()
                  for col in df.columns]

    for col in ['company__(maker-if_known)',
                'specific_bean_origin_or_bar_name',
                'broad_bean_origin']:
        if col == 'broad_bean_origin':
            # Replace the one NaN by a space-like value BEFORE string ops.
            # (Original called fillna on an already-categorical column, which
            # raises when the fill value is not among the categories, and
            # used the chained-inplace antipattern.)
            df[col] = df[col].fillna('\xa0')
        df[col] = df[col].str.replace('&', 'et', regex=False)
        df[col] = df[col].str.replace('\xa0', ' ', regex=False)
    # Categorize AFTER cleaning — the original cast before cleaning was
    # immediately overwritten by the string replacements.
    df['broad_bean_origin'] = df['broad_bean_origin'].astype('category')
    # drop the only row with missing label
    df.drop(index=df.index[df['bean_type'].isna()], inplace=True)
    return df
11 changes: 9 additions & 2 deletions src/colleges.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from collections import namedtuple
import re

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -64,6 +65,12 @@ def get_colleges_df(save=True):
cats = ['State', 'Predominant Degree', 'Highest Degree', 'Ownership', 'Region', 'ZIP']
for c in cats:
df[c] = df[c].astype('category')

write_df(save, df, data_dir[1], COLLEGES_CONFIG.main_file)
df.rename(columns={col: col.lower() for
col in df.columns}, inplace=True)
# write_df(save, df, data_dir[1], COLLEGES_CONFIG.main_file)
df.rename(columns={col: re.sub(' ', '_', col).lower() for
col in df.columns}, inplace=True)

# drop rows with missing label
df.drop(index = df.index[df['percent_pell_grant'].isna()], inplace=True)
return df
17 changes: 12 additions & 5 deletions src/crime_data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from collections import namedtuple
import re

import pandas as pd

Expand All @@ -24,13 +25,17 @@


def get_crime_df(save=True):
data_dir = fetch(CRIME_DATA_CONFIG)
file = os.listdir(data_dir[0])[0]
csv_path = os.path.join(data_dir[0], file)
# FIXME dead link :s
# data_dir = fetch(CRIME_DATA_CONFIG)
# data_dir = "/home/thomas/Documents/datasets/src/data/dragostore/crime_data/"
# file = os.listdir(data_dir[0])[0]
# csv_path = os.path.join(data_dir[0], file)
csv_path = "/home/thomas/Documents/datasets/src/data/dragostore/crime_data/Crime_Data_from_2010_to_Present.csv"
df = pd.read_csv(csv_path)

cols = ['Area Name', 'Victim Sex', 'Victim Descent', 'Premise Description', 'Weapon Description',
'Status Description', 'Crime Code Description']

df['Victim Age'] = float_to_int(df['Victim Age'], df.index)
df['Premise Code'] = float_to_int(df['Premise Code'], df.index)
df['Weapon Used Code'] = float_to_int(df['Weapon Used Code'], df.index)
Expand All @@ -42,6 +47,8 @@ def get_crime_df(save=True):
if df[c].dtype == float:
df[c] = float_to_int(df[c], df.index)
df[c] = df[c].astype('category')

write_df(save, df, data_dir[1], CRIME_DATA_CONFIG.main_file)
df.rename(columns={col: re.sub(' ', '_', col) for
col in df.columns}, inplace=True)
# drop rows with missing label
df.drop(index = df.index[df['Victim_Age'].isna()], inplace=True)
return df
48 changes: 48 additions & 0 deletions src/dataset_id.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# Map each dataset name to its id on openml.org (None = not uploaded yet).

# The first group of datasets comes from "Encoding high-cardinality string
# categorical variables", https://arxiv.org/pdf/1907.01860.pdf
dataset_id = {
    "building_permits" : None,
    "beer_reviews" : 42088,
    "colleges" : 42159,
    "crime_data" : 42160,
    "drug_directory" : None,
    "employe_salaries" : 42125,
    "federal_election" : 42080,
    "journal_influence" : 42123,
    "kickstarter_projects" : 42076,
    "medical_charge" : 42131,
    "met_objects" : None,
    "midwest_survey" : None,
    "open_payment" : None,
    "public_procurement" : 42163,
    "road_safety" : None,
    "traffic_violations" : 42132,
    "vancouver_employee" : 42089,
    "wine_reviews" : 42074,

    # NOTE(review): this second group is not listed with the paper datasets
    # above — presumably standard comparison benchmarks; confirm.
    "adult" : 1590,
    "cacao_flavor" : 42166,
    "california_housing" : 537,
    "dating_profile" : 42164,
    "house_prices" : 42165,
    "house_sales" : 42092,
    "intrusion_detection" : 1113,
}

from sklearn.datasets import fetch_openml  # NOTE(review): move to top of module


def test_fetch_openml():
    """Smoke-test: fetch every dataset that already has an OpenML id.

    Requires scikit-learn >= 0.22 (for ``as_frame=True``) and network access.
    """
    for name, data_id in dataset_id.items():
        # Renamed from `id`, which shadowed the builtin.
        if data_id is not None:
            print('fetching ', name,' ...')
            data = fetch_openml(data_id=data_id, as_frame=True)
            print('- data shape:', data['data'].shape)



33 changes: 33 additions & 0 deletions src/dating_profile.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@


import os
from collections import namedtuple
import re

import pandas as pd

# Descriptor for one dataset: where to download it and which file is the main table.
DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source'])
# One downloadable resource: url, file(s) it provides, whether to uncompress.
UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress'])

# Download configuration for the OkCupid dating-profiles dataset.
DATING_PROFILE_CONFIG = DatasetInfo(
    name='dating_profile',
    urlinfos=(
        UrlInfo(
            url="https://github.com/rudeboybert/JSE_OkCupid/raw/master/profiles.csv.zip",
            filenames=(
                # NOTE(review): the archive and get_dating_profile_df use
                # 'profiles.csv' — 'profile.csv' looks like a typo; confirm.
                "profile.csv",
            ), uncompress=False
        ),
    ),
    main_file="profile.csv",
    source="https://github.com/rudeboybert/JSE_OkCupid/raw/master/profiles.csv.zip"
)


def get_dating_profile_df(save=True):
    """Load the OkCupid profiles CSV and return it with snake_case column names.

    NOTE(review): `save` is currently unused and DATING_PROFILE_CONFIG is not
    consulted — the CSV is read from a hard-coded local path; confirm intent.
    """
    profiles = pd.read_csv('data/profiles/profiles.csv', sep=',', encoding='latin1')
    profiles.columns = [re.sub(' ', '_', name).lower() for name in profiles.columns]
    return profiles
40 changes: 40 additions & 0 deletions src/drug_discovery.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
import os
from collections import namedtuple
import re

import pandas as pd

from common.file_management import fetch, write_df

# Descriptor for one dataset: where to download it and which file is the main table.
DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source'])
# One downloadable resource: url, file(s) it provides, whether to uncompress.
UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress'])

# FDA National Drug Code directory (tab-separated 'product.txt' inside the zip).
DRUG_DISCOVERY_CONFIG = DatasetInfo(
    name='drug_discovery',
    urlinfos=(
        UrlInfo(
            url="https://www.accessdata.fda.gov/cder/ndctext.zip",
            filenames=(
                "product.txt",
            ), uncompress=False
        ),
    ),
    main_file="product.txt",
    source="https://www.fda.gov/drugs/drug-approvals-and-databases/national-drug-code-directory"
)


def get_drug_discovery_df(save=True):
    """Fetch the FDA National Drug Code directory and return the raw product table.

    NOTE(review): per the original in-line comment the upstream data format
    changed, so only the raw table is returned. The post-processing that
    followed the early `return` (categorical casts for 'DRG Definition' /
    'Provider State' — columns from a *different* dataset — plus rename and
    write_df) was unreachable dead code and has been removed; `save` is
    therefore currently unused.
    """
    data_dir = fetch(DRUG_DISCOVERY_CONFIG)
    # NOTE(review): os.listdir order is not guaranteed — selecting index [1]
    # is fragile; confirm the intended file and select it by name.
    file = os.listdir(data_dir[0])[1]
    csv_path = os.path.join(data_dir[0], file)
    df = pd.read_csv(csv_path, sep='\t', encoding='latin1')
    return df
3 changes: 3 additions & 0 deletions src/employee_salaries.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import datetime
import os
from collections import namedtuple
import re

import pandas as pd

Expand Down Expand Up @@ -35,4 +36,6 @@ def get_employee_salaries_df(save=True):
df['Department Name'] = df['Department Name'].astype('category')
df['Assignment Category'] = df['Assignment Category'].astype('category')
write_df(save, df, data_dir[1], EMPLOYEE_SALARIES_CONFIG.main_file)
df.rename(columns={col: re.sub(' ', '_', col).lower() for
col in df.columns}, inplace=True)
return df
67 changes: 67 additions & 0 deletions src/federal_election.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
import os
import re
from collections import namedtuple

import pandas as pd
import numpy as np

from common.file_management import fetch, write_df, float_to_int

# Descriptor for one dataset: where to download it and which file is the main table.
DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source'])
# One downloadable resource: url, file(s) it provides, whether to uncompress.
UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress'])

# FEC 2012 bulk individual-contributions file ('|'-separated, no header row).
FEDERAL_ELECTION_CONFIG = DatasetInfo(
    name='federal_election',
    urlinfos=(
        UrlInfo(
            url='https://cg-519a459a-0ea3-42c2-b7bc-fa1143481f74.s3-us-gov-west-1.amazonaws.com/bulk-downloads/2012/indiv12.zip',
            filenames=(
                "itcont.txt",
            ), uncompress=True
        ),
    ),
    main_file="itcont.txt",
    source="https://classic.fec.gov/finance/disclosure/ftpdet.shtml"
)

# Separate one-line CSV supplying the column names for itcont.txt.
FEDERAL_ELECTION_HEADER_CONFIG = DatasetInfo(
    name='federal_election',
    urlinfos=(
        UrlInfo(
            url='https://classic.fec.gov/finance/disclosure/metadata/indiv_header_file.csv',
            filenames=(
                "indiv_header_file.csv",
            ), uncompress=False
        ),
    ),
    main_file="indiv_header_file.csv",
    source="https://classic.fec.gov/finance/disclosure/metadata/DataDictionaryContributionsbyIndividuals.shtml"
)


def get_federal_election_df(save=True):
    """Fetch the 2012 FEC individual-contributions data and return it as a DataFrame.

    The target column 'transaction_amt' holds the log of the (absolute)
    donation amount.

    Parameters
    ----------
    save : bool, default True
        Passed to write_df to control writing the processed frame to disk.
    """
    # data
    data_dir = fetch(FEDERAL_ELECTION_CONFIG)
    csv_path = os.path.join(data_dir[0], "itcont.txt")
    # header: column names live in a separate one-line CSV
    data_dir_header = fetch(FEDERAL_ELECTION_HEADER_CONFIG)
    csv_path_header = os.path.join(data_dir_header[0], "indiv_header_file.csv")

    df_header = pd.read_csv(csv_path_header)
    df = pd.read_csv(csv_path, sep='|', encoding='latin1',
                     header=None, names=df_header.columns)
    # Some donations are negative (refunds); use their magnitude.
    df['TRANSACTION_AMT'] = df['TRANSACTION_AMT'].abs()
    # Predicting the log of the donation
    df['TRANSACTION_AMT'] = df['TRANSACTION_AMT'].apply(np.log)
    # NOTE(review): this keeps rows with log(amount) > 0, i.e. amount > 1
    # dollar; if the intent was amount > 0, filter BEFORE taking the log.
    df = df[df['TRANSACTION_AMT'] > 0]
    df.rename(columns={col: col.lower() for col in df.columns}, inplace=True)
    df['zip_code'] = df['zip_code'].astype(str)
    # Fix a stray '{' in one city name. Single .loc assignment: the original
    # chained `df['city'].loc[idx] = ...` assigns through a copy after the
    # boolean filtering above and may silently not stick. Guarded in case
    # the row was filtered out.
    if 1378568 in df.index:
        df.loc[1378568, 'city'] = re.sub('{', '', df.loc[1378568, 'city'])
    df['memo_text'] = df['memo_text'].astype('category')
    write_df(save, df, data_dir[1], FEDERAL_ELECTION_CONFIG.main_file)
    return df
33 changes: 33 additions & 0 deletions src/house_price.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@


import os
from collections import namedtuple
import re

import pandas as pd

# Descriptor for one dataset: where to download it and which file is the main table.
DatasetInfo = namedtuple('DatasetInfo', ['name', 'urlinfos', 'main_file', 'source'])
# One downloadable resource: url, file(s) it provides, whether to uncompress.
UrlInfo = namedtuple('UrlInfo', ['url', 'filenames', 'uncompress'])

# Kaggle "House Prices: Advanced Regression Techniques" competition.
# NOTE(review): the url is the competition *overview page*, not a direct
# download link, and filenames ('profile.csv') / main_file ('train_test.csv')
# do not match the 'data/house-prices/train.csv' that get_house_price_df
# actually reads — confirm before wiring this config into fetch().
HOUSE_PRICE_CONFIG = DatasetInfo(
    name='house_price',
    urlinfos=(
        UrlInfo(
            url="https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview",
            filenames=(
                "profile.csv",
            ), uncompress=False
        ),
    ),
    main_file="train_test.csv",
    source="https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview"
)


def get_house_price_df(save=True):
    """Load the Kaggle house-prices training CSV and return it with
    underscore-separated column names.

    NOTE(review): `save` is unused and the CSV must already exist locally
    (the Kaggle competition data requires a manual, authenticated download).
    """
    frame = pd.read_csv('data/house-prices/train.csv')
    frame.columns = [re.sub(' ', '_', name) for name in frame.columns]
    return frame
19 changes: 19 additions & 0 deletions src/house_sales.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
import pandas as pd
import numpy as np
import kaggle


def get_house_sales_df():
    """Download the King County house-sales dataset from Kaggle and return a cleaned DataFrame."""
    kaggle.api.authenticate()
    kaggle.api.dataset_download_files('harlfoxem/housesalesprediction',
                                      path='data/house_sales/raw', unzip=True)

    csv_path = 'data/house_sales/raw/kc_house_data.csv'
    # Use the first (house id) column as the index. The original read the
    # CSV twice — the first read without index_col was immediately
    # overwritten — and left a debug print; both removed.
    df = pd.read_csv(csv_path, index_col=0)
    df.rename(columns={col: col.lower() for col in df.columns}, inplace=True)
    # Zip codes are categorical identifiers, not numbers.
    df['zipcode'] = df['zipcode'].astype(str).astype('category')
    return df
Loading