59 changes: 34 additions & 25 deletions trainer/lightgbm_main.py
@@ -25,11 +25,14 @@

import lightgbm as lgb
import pandas as pd
import numpy as np

from trainer.cross_validation import cross_val_score
from sklearn.model_selection import StratifiedKFold
import trainer.lightgbm_functions as lf
import trainer.preprocessing as pp
import trainer.plotting_functions as myplot
from sklearn.metrics import roc_auc_score


# Default parameters
@@ -58,7 +61,7 @@


def lgb_cv(params, training_data, predictors, target, validation_data=None,
categorical_features=None, n_splits=5, early_stopping_rounds=20):
categorical_features=None, n_splits=3, early_stopping_rounds=20):
"""
Returns the average score after performing cross-validation on
`training_data` with `n_splits` splits. At each iteration, LightGBM
@@ -88,13 +91,28 @@ def lgb_cv(params, training_data, predictors, target, validation_data=None,

# Run k-fold cross-validation
logging.info('Running cross validation...')
scores = []
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1)
scores = cross_val_score(gbm, training_data[predictors].values,
training_data[target].values,
scoring='roc_auc', cv=skf, n_jobs=1, verbose=1,
fit_params=fit_params)
fold = 0

return scores.mean()
for train_index, test_index in skf.split(np.zeros(training_data.shape[0]), training_data[target]):
fold = fold + 1
# print("TRAIN INDEX:", train_index, "TEST INDEX:", test_index)
train = pp.preprocess_common(training_data.iloc[train_index])
test = pp.preprocess_common(training_data.iloc[test_index])
train_df = pp.preprocess_confidence(train)
test_df = pp.preprocess_confidence(train, test)

gbm = lgb_train(params, train_df, predictors, target,
categorical_features=categorical_features, validation_data=validation_data)

y_hat = gbm.predict(test_df[predictors].values)

score = roc_auc_score(test_df[target].values, y_hat)
#myplot.plot_roc_curve(test[target].values, y_hat, score)
print("fold=%d, auc: %.2f%%" % (fold, score))
scores.append(score)
return np.mean(scores)
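
Note: the rewritten loop recomputes every engineered feature inside each fold, so the held-out split never sees statistics derived from its own labels. A minimal, self-contained sketch of that pattern, with an illustrative `confidence_rate` helper standing in for the project's `preprocess_confidence`:

```python
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold

def confidence_rate(train, test, col, target):
    # Target mean per category, learned on the training split only.
    rates = train.groupby(col)[target].mean()
    # Unseen categories fall back to the global training mean.
    return test[col].map(rates).fillna(train[target].mean())

rng = np.random.default_rng(1)
df = pd.DataFrame({'app': rng.integers(0, 5, 1000),
                   'is_attributed': rng.integers(0, 2, 1000)})

scores = []
skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=1)
for train_idx, test_idx in skf.split(df, df['is_attributed']):
    train, test = df.iloc[train_idx].copy(), df.iloc[test_idx].copy()
    test['app_confRate'] = confidence_rate(train, test, 'app', 'is_attributed')
    y_hat = test['app_confRate']  # stand-in for real model predictions
    scores.append(roc_auc_score(test['is_attributed'], y_hat))
print(np.mean(scores))
```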


def lgb_train(params, training_data, predictors, target,
@@ -136,27 +154,18 @@ def main():
level=args.log)

logging.info('Preprocessing...')

# Load the training data, i.e. "the 90%"
train_df = pp.load_train(args.train_file, int(args.number_lines)
if args.number_lines is not None else None)
train_df = pp.preprocess_confidence(train_df)

# Load the validation data, i.e. "the 10%"

# Load training data set, i.e. "the 90%"
train_df = pp.load_train_raw(args.train_file, 2699999)

# Load validation data set, i.e. "the 10%"
if args.valid_file is not None:
valid_df = pp.load_train(args.valid_file)
valid_df = pp.preprocess_confidence(train_df, valid_df)
else:
valid_df = None

# Load the test data set, i.e. the data for which we need to make predictions
valid_df = pp.load_train(args.valid_file, 300002)
valid_df = pp.preprocess_confidence(valid_df)
# Load the test data set, i.e. data for which we need to make predictions
if args.test_file is not None:
test_df = pp.load_test(args.test_file)
test_df = pp.preprocess_confidence(train_df, test_df)
else:
test_df = None
test_df = pp.load_test_raw(args.test_file)

# Column we're trying to predict
target = 'is_attributed'

# Provide default hyperparameter values
@@ -200,7 +209,7 @@
# Run cross-validation
logging.info('Cross-validation part...')
score = lgb_cv(lgb_params, train_df, pp.predictors, target,
categorical_features=pp.categorical, n_splits=5,
categorical_features=pp.categorical, n_splits=3,
validation_data=valid_df)
logging.info('Average score across the folds: {}'.format(score))
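
For reference, a minimal invocation sketch of the revised `lgb_cv`; the hyperparameter values below are placeholders, not the repo's defaults:

```python
lgb_params = {
    'objective': 'binary',   # placeholder settings
    'metric': 'auc',
    'num_leaves': 31,
    'learning_rate': 0.1,
}
score = lgb_cv(lgb_params, train_df, pp.predictors, 'is_attributed',
               categorical_features=pp.categorical, n_splits=3,
               validation_data=valid_df)
```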

18 changes: 18 additions & 0 deletions trainer/plotting_functions.py
@@ -0,0 +1,18 @@
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt


def plot_roc_curve(y, yhat, roc_auc):
fpr, tpr, _ = roc_curve(y, yhat)
plt.figure()
lw = 2
plt.plot(fpr, tpr, color='darkorange',
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.show()
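
A quick usage sketch for the new helper, with toy labels and scores (values are illustrative):

```python
import numpy as np
from sklearn.metrics import roc_auc_score
import trainer.plotting_functions as myplot

y = np.array([0, 0, 1, 1])               # true labels
y_hat = np.array([0.1, 0.4, 0.35, 0.8])  # predicted probabilities
myplot.plot_roc_curve(y, y_hat, roc_auc_score(y, y_hat))
```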
32 changes: 14 additions & 18 deletions trainer/preprocessing.py
@@ -35,15 +35,14 @@


# Columns our predictions are based on
predictors = ['app', 'device', 'os', 'channel', 'hour', 'hour_sq',
'count_ip_day_hour', 'count_ip_hour_os', 'count_ip_hh_app',
'count_ip_hour_device', 'ip_confRate', 'app_confRate',
'device_confRate', 'os_confRate', 'channel_confRate',
predictors = ['ip', 'app', 'device', 'os', 'channel', 'hour', 'hour_sq',
'count_ip_day_hour', 'count_ip_hour_os',
'count_ip_hh_app', 'count_ip_hour_device', 'ip_confRate',
'app_confRate', 'device_confRate', 'os_confRate', 'channel_confRate',
'app_channel_confRate', 'app_os_confRate', 'app_device_confRate',
'channel_os_confRate', 'channel_device_confRate',
'os_device_confRate']
categorical = ['app', 'device', 'os', 'channel', 'hour', 'hour_sq',
'count_ip_day_hour', 'count_ip_hour_os',
'channel_os_confRate', 'channel_device_confRate', 'os_device_confRate']
categorical = ['ip', 'app', 'device', 'os', 'channel', 'hour', 'hour_sq',
'count_ip_day_hour', 'count_ip_hour_os',
'count_ip_hh_app', 'count_ip_hour_device']
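
These lists are what LightGBM's categorical handling keys on; a sketch of how they would typically be wired into a `lgb.Dataset` (the exact construction inside `lgb_train` is not shown in this diff, so this is an assumption):

```python
import lightgbm as lgb
import trainer.preprocessing as pp

# Assumes train_df already carries all engineered predictor columns.
dtrain = lgb.Dataset(train_df[pp.predictors].values,
                     label=train_df['is_attributed'].values,
                     feature_name=pp.predictors,
                     categorical_feature=pp.categorical)
```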


@@ -55,7 +54,7 @@ def reformat_click_time(df):
df.drop(['click_time'], axis=1, inplace=True)


def _preprocess_common(df):
def preprocess_common(df):
"""
Data transformations that should be done to both training and test data.
"""
@@ -143,7 +142,7 @@ def rate_calculation(x):
return rate * conf
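
The visible tail of `rate_calculation` multiplies a group's attribution rate by a confidence weight; a hedged sketch of that idea (the exact confidence formula is an assumption, not lifted from this file):

```python
import numpy as np

def rate_calculation_sketch(group_target, log_base=100000):
    # Rate: share of positive labels within the group.
    rate = group_target.mean()
    # Confidence: grows with group size and saturates at 1.0.
    conf = min(1.0, np.log(1 + len(group_target)) / np.log(log_base))
    return rate * conf
```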


def preprocess_confidence(train_df, test_df=None):
def preprocess_confidence(train_df, test_df=None, valid_df=None):
"""
Feature creation that should be done given training data and then
merged with test data.
@@ -240,13 +239,12 @@ def load_train_raw(filename, number_samples):
nrows=number_samples)


def load_test_raw(filename, number_samples):
def load_test_raw(filename):
columns = ['ip','app','device','os', 'channel', 'click_time',
'click_id']
logging.info('Loading unlabeled data from {!r}...'.format(filename))
with open_dispatching(filename, mode='rb') as f:
return pd.read_csv(f, dtype=DTYPES, usecols=columns,
nrows=number_samples)
return pd.read_csv(f, dtype=DTYPES, usecols=columns)


def load_train(filename, number_samples=None):
@@ -256,14 +254,12 @@ def load_train(filename, number_samples=None):
"""
if number_samples is not None and number_samples < 0:
number_samples = None
return _preprocess_common(load_train_raw(filename, number_samples))
return preprocess_common(load_train_raw(filename, number_samples))


def load_test(filename, number_samples=None):
def load_test(filename):
"""
Reads and preprocesses unlabeled data from `filename`. This method should be
called for test data preprocessing.
"""
if number_samples < 0:
number_samples = None
return _preprocess_common(load_test_raw(filename, number_samples))
return preprocess_common(load_test_raw(filename))
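
Put together, the reshuffled loaders compose like this; the file paths and row cap below are illustrative:

```python
import trainer.preprocessing as pp

raw_train = pp.load_train_raw('data/train.csv', 1000000)  # illustrative row cap
raw_test = pp.load_test_raw('data/test.csv')

train_df = pp.preprocess_common(raw_train)
test_df = pp.preprocess_common(raw_test)

# Confidence features are learned on train, then merged into test,
# mirroring the per-fold usage in lgb_cv.
train_conf = pp.preprocess_confidence(train_df)
test_conf = pp.preprocess_confidence(train_df, test_df)
```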