Skip to content

Commit

Permalink
fix flake8 warnings
Browse files (browse the repository at this point in the history)
  • Loading branch information
svirpioj committed May 25, 2021
1 parent 66be02a commit 31d0d4d
Show file tree
Hide file tree
Showing 5 changed files with 41 additions and 43 deletions.
40 changes: 18 additions & 22 deletions opusfilter/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,8 +153,8 @@ def write_probs(self, input_fname, output_fname, true_label=None,
probas = self.classifier.predict_proba(df[self.features])
if true_label:
true_labels = df_tbc[true_label]
logger.info('roc_auc: %s', roc_auc_score(true_labels, probas[:,1]))
for proba in probas[:,1]:
logger.info('roc_auc: %s', roc_auc_score(true_labels, probas[:, 1]))
for proba in probas[:, 1]:
output.write('{0:.10f}\n'.format(proba))

def weights(self):
Expand All @@ -172,7 +172,7 @@ class TrainClassifier:
"""Classify clean and noisy sentence pairs"""

def __init__(self, training_scores=None, dev_scores=None, model_type=None,
model_parameters=None, features=None, **kwargs):
model_parameters=None, features=None, **kwargs):
logger.info("Loading training data")
self.df_training_data = load_dataframe(training_scores)

Expand All @@ -198,11 +198,11 @@ def __init__(self, training_scores=None, dev_scores=None, model_type=None,
self.dev_data = None
self.dev_labels = None

if model_type == None:
if model_type is None:
self.model_type = 'LogisticRegression'
else:
self.model_type = model_type
if model_parameters == None:
if model_parameters is None:
self.model_parameters = {}
else:
self.model_parameters = model_parameters
Expand All @@ -220,7 +220,7 @@ def get_roc_auc(self, model, dev_data):
# pred = model.classifier.predict(dev_data)
# logger.info("Classifier labels: %s", model.classifier.classes_)
# logger.info("Predicted labels: %s", collections.Counter(pred))
return roc_auc_score(self.dev_labels, probs[:,1])
return roc_auc_score(self.dev_labels, probs[:, 1])

def get_sse(self, model, training_data, labels):
"""Calculate the residual sum of squares"""
Expand All @@ -237,17 +237,16 @@ def get_ce(self, model, training_data, labels):
def get_aic(self, model, training_data, labels):
"""Calculate AIC for a given model"""
loss = self.get_ce(model, training_data, labels)
k = training_data.shape[1] # number of variables
AIC = 2*k - 2*math.log(loss)
k = training_data.shape[1] # number of variables
AIC = 2 * k - 2 * math.log(loss)
return AIC

def get_bic(self, model, training_data, labels):
"""Calculate BIC for a given model"""
loss = self.get_ce(model, training_data, labels)
k = training_data.shape[1] # number of variables
n = training_data.shape[0] # number of observations
BIC = n*math.log(loss/n) + k*math.log(n)
#BIC = math.log(n)*k - 2*math.log(loss)
k = training_data.shape[1] # number of variables
n = training_data.shape[0] # number of observations
BIC = n * math.log(loss / n) + k * math.log(n)
return BIC

def get_labels(self, training_data, cutoffs):
Expand Down Expand Up @@ -299,16 +298,13 @@ def _load_feature_bounds_and_init(fdict):

def find_best_model(self, criterion_name, algorithm='default', options=None):
"""Find the model with the best AIC / BIC / SSE / CE / ROC_AUC"""
criteria = {'AIC':
{'func': self.get_aic, 'best': 'low', 'dev': False},
'BIC':
{'func': self.get_bic, 'best': 'low', 'dev': False},
'SSE':
{'func': self.get_sse, 'best': 'low', 'dev': False},
'CE':
{'func': self.get_ce, 'best': 'low', 'dev': False},
'ROC_AUC':
{'func': self.get_roc_auc, 'best': 'high', 'dev': True}}
criteria = {
'AIC': {'func': self.get_aic, 'best': 'low', 'dev': False},
'BIC': {'func': self.get_bic, 'best': 'low', 'dev': False},
'SSE': {'func': self.get_sse, 'best': 'low', 'dev': False},
'CE': {'func': self.get_ce, 'best': 'low', 'dev': False},
'ROC_AUC': {'func': self.get_roc_auc, 'best': 'high', 'dev': True}
}

if criterion_name not in criteria.keys():
raise ValueError('Invalid criterion. Expected one of: {}'.format(
Expand Down
11 changes: 7 additions & 4 deletions opusfilter/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,11 @@
from bs4 import BeautifulSoup as bs

from . import FilterABC, ConfigurationError
from .lm import CrossEntropyFilter, CrossEntropyDifferenceFilter
from .word_alignment import WordAlignFilter
from .lm import CrossEntropyFilter, CrossEntropyDifferenceFilter # noqa: F401
from .word_alignment import WordAlignFilter # noqa: F401


logger = logging.getLogger(__name__)


class LengthFilter(FilterABC):
Expand Down Expand Up @@ -197,7 +200,7 @@ def confidence(self, sentence, lan):
if self.id_method == 'cld2':
try:
clddetails = pycld2.detect(sentence)
except Exception as exp:
except Exception:
clddetails = (0, 0, ((0, 'un', 0.0), 0))

cldlan = clddetails[2][0][1]
Expand All @@ -209,7 +212,7 @@ def confidence(self, sentence, lan):
elif self.id_method == 'langid':
try:
lidetails = self.identifier.classify(sentence)
except Exception as exp:
except Exception:
lidetails = ('un', 0.0)
lilan, liconf = lidetails[0], round(lidetails[1], 2)
if lilan != lan:
Expand Down
30 changes: 15 additions & 15 deletions opusfilter/opusfilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,8 +277,7 @@ def get_subset(self, parameters, overwrite=False):
logger.info("Sampling subset of %s lines from total %s lines", size, total)
if shuffle_subset:
sample = random.sample(range(total), size)
with file_open(inputs[0]) as inf, \
file_open(outputs[0], 'w') as outf:
with file_open(infiles[0]) as inf, file_open(outfiles[0], 'w') as outf:
for line in self._yield_subset(inf, sample):
outf.write(line)
for infname, outfname in zip(infiles[1:], outfiles[1:]):
Expand Down Expand Up @@ -361,16 +360,17 @@ def train_classifier(self, parameters, overwrite=False):
if not overwrite and os.path.isfile(model_out):
logger.info("Output file exists, skipping step")
return
training_scores = os.path.join(self.output_dir,
parameters['training_scores'])
training_scores = os.path.join(self.output_dir, parameters['training_scores'])
dev_scores = os.path.join(self.output_dir, parameters['dev_scores']) \
if 'dev_scores' in parameters else None
trainer = classifier.TrainClassifier(training_scores=training_scores,
dev_scores=dev_scores, model_type=parameters['model_type'],
model_parameters=parameters['model_parameters'],
features=parameters['features'])
trainer = classifier.TrainClassifier(
training_scores=training_scores,
dev_scores=dev_scores, model_type=parameters['model_type'],
model_parameters=parameters['model_parameters'],
features=parameters['features']
)
model, value, features = trainer.find_best_model(
parameters['criterion'], **parameters.get('optimization', {}))
parameters['criterion'], **parameters.get('optimization', {}))

logger.info('Best model has {criterion}: {value}'.format(
criterion=parameters['criterion'], value=value))
Expand All @@ -387,8 +387,8 @@ def train_classifier(self, parameters, overwrite=False):

logger.info('Saving best model to {}'.format(model_out))

#with file_open(model_out, 'wb') as model_file:
#TODO: ValueError: binary mode doesn't take an encoding argument
# with file_open(model_out, 'wb') as model_file:
# TODO: ValueError: binary mode doesn't take an encoding argument
with open(model_out, 'wb') as model_file:
pickle.dump(model, model_file)

Expand All @@ -401,13 +401,13 @@ def classify(self, parameters, overwrite=False):
self.output_dir, parameters['output_probabilities']) \
if 'output_probabilities' in parameters else None
if (not overwrite and
(labels_out is None or os.path.isfile(labels_out)) and
(probs_out is None or os.path.isfile(probs_out))):
(labels_out is None or os.path.isfile(labels_out)) and
(probs_out is None or os.path.isfile(probs_out))):
logger.info("Output files exists, skipping step")
return
model_in = os.path.join(self.output_dir, parameters['model'])
#with file_open(model_in, 'rb') as model_file:
#TODO: ValueError: binary mode doesn't take an encoding argument
# with file_open(model_in, 'rb') as model_file:
# TODO: ValueError: binary mode doesn't take an encoding argument
with open(model_in, 'rb') as model_file:
model = pickle.load(model_file)
scores_in = os.path.join(self.output_dir, parameters['scores'])
Expand Down
2 changes: 1 addition & 1 deletion opusfilter/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ def get_score_tuples(self):
if counts[nametuple] > 1:
instances[nametuple] += 1
newtuple = (clsname, str(instances[nametuple])) if name is None \
else (clsname, name, str(instances[nametuple]))
else (clsname, name, str(instances[nametuple]))
else:
newtuple = (clsname, ) if name is None else (clsname, name)
renamed.append(newtuple)
Expand Down
1 change: 0 additions & 1 deletion opusfilter/preprocessors.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
"""Corpus preprocessing"""

import collections
from functools import reduce
import operator
import re
Expand Down

0 comments on commit 31d0d4d

Please sign in to comment.