Commit f78f90a: Last commit after submitting

lorainemg committed May 1, 2021 (1 parent: cf79a5e)

Showing 4 changed files with 80 additions and 75 deletions.
scripts/classifier.py (26 additions, 20 deletions)

@@ -28,13 +28,19 @@ def fit(self, path: Path):
         collection = Collection().load_dir(path)

         print(f"Loaded {len(collection)} sentences for fitting.")
-        print('Starting ner classifier training')
-        self.ner_classifier.train(collection)
-        print('Starting re classifier training')
-        self.re_classifier.train(collection)
+        try:
+            print('Loading ner classifier')
+            self.ner_classifier.load_model('ner')
+            print('Loading re classifier')
+            self.re_classifier.load_model('re')
+        except:
+            print('Starting ner classifier training')
+            self.ner_classifier.train(collection)
+            print('Starting re classifier training')
+            self.re_classifier.train(collection)
         # print(f"Training completed: Stored {len(keyphrases)} keyphrases and {len(relations)} relation pairs.")

-    def eval(self, path: Path, scenarios: List[int], submit: Path):
+    def eval(self, path: Path, scenarios: List[int], submit: Path, run):
         """Function that evals according to the baseline classifier"""
         # It's not changed
         for id in scenarios:
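
The hunk above wraps training in a try/except so that previously saved weights are reused when present. A minimal sketch of that load-or-train pattern, assuming classifiers that expose load_model()/train() as above (the helper name fit_or_load and the exception choice are illustrative, not from the repository):

    def fit_or_load(classifier, name, collection):
        """Reuse saved weights when they exist; otherwise train from scratch."""
        try:
            classifier.load_model(name)   # assumed to raise (e.g. OSError) when no saved model exists
            print(f'Loaded {name} classifier')
        except Exception:                 # narrower than the bare `except:` in the commit
            print(f'Starting {name} classifier training')
            classifier.train(collection)

Note that the committed bare `except:` also swallows KeyboardInterrupt and SystemExit; catching Exception (or the specific load error) is the safer variant.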
@@ -47,8 +53,8 @@ def eval(self, path: Path, scenarios: List[int], submit: Path):
print(f"Loaded {len(input_data)} input sentences.")
output_data = self.run(input_data, taskA, taskB)

print(f"Writing output to {submit / folder}")
output_data.dump(submit / folder / "output.txt", skip_empty_sentences=False)
print(f"Writing output to {submit / run / folder }")
output_data.dump(submit / run / folder / "output.txt", skip_empty_sentences=False)


def run(self, collection, taskA, taskB):
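
The new `run` argument threads the run name through to the output path as a separate segment. A short pathlib sketch of how the segments compose (the folder value mirrors the scenario directories used elsewhere in the repo and is illustrative):

    from pathlib import Path

    submit = Path('2021/submissions/classifier/testing')
    run, folder = 'run1', 'scenario1-main'   # illustrative values

    # Before this commit the run was baked into `submit`; now it is its own segment:
    out = submit / run / folder / 'output.txt'
    print(out)  # 2021/submissions/classifier/testing/run1/scenario1-main/output.txt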
@@ -85,38 +91,38 @@ def main():
         eval_ = Path('2021/eval/testing')
         print(f'Evaluating testing run {i}')
         scenarios = [1, 2, 3]
-        submit_ = Path(f'2021/submissions/classifier/testing/run{i}')
+        submit_ = Path(f'2021/submissions/classifier/testing/')

-        clsf.eval(eval_, scenarios, submit_)
+        clsf.eval(eval_, scenarios, submit_, f'run{i}')

-    score.main(Path('../2021/eval/testing'),
-               Path('../2021/submissions/classifier/testing'),
+    score.main(Path('2021/eval/testing'),
+               Path('2021/submissions/classifier/testing'),
                runs=[1,2,3], scenarios=[1,2,3], verbose=True, prefix="")


     for i in range(3):
         eval_ = Path('2021/eval/training')
         print(f'Evaluating training run {i}')
         scenarios = [1, 2, 3]
-        submit_ = Path(f'2021/submissions/classifier/training/run{i}')
+        submit_ = Path(f'2021/submissions/classifier/training/')

-        clsf.eval(eval_, scenarios, submit_)
+        clsf.eval(eval_, scenarios, submit_, f'run{i}')

-    score.main(Path('../2021/eval/training'),
-               Path('../2021/submissions/classifier/training'),
+    score.main(Path('2021/eval/training'),
+               Path('2021/submissions/classifier/training'),
                runs=[1,2,3], scenarios=[1,2,3], verbose=True, prefix="")

     for i in range(3):
         eval_ = Path('2021/eval/develop')
         print(f'Evaluating develop run {i}')
         scenarios = [1, 2, 3]
-        submit_ = Path(f'2021/submissions/classifier/develop/run{i}')
+        submit_ = Path(f'2021/submissions/classifier/develop/')

-        clsf.eval(eval_, scenarios, submit_)
+        clsf.eval(eval_, scenarios, submit_, f'run{i}')


-    score.main(Path('../2021/eval/develop'),
-               Path('../2021/submissions/classifier/develop'),
+    score.main(Path('2021/eval/develop'),
+               Path('2021/submissions/classifier/develop'),
                runs=[1,2,3], scenarios=[1,2,3], verbose=True, prefix="")

 if __name__ == "__main__":
scripts/ner_clsf.py (31 additions, 31 deletions)

@@ -28,16 +28,16 @@ def __init__(self):
         self.n_entities = 4
         self.encoder_tags = LabelEncoder()
         self.encoder_entities = LabelEncoder()
-        self.ScieloSku = fasttext.load_model("./Scielo_cbow_cased.bin")
+        # self.ScieloSku = fasttext.load_model("./Scielo_cbow_cased.bin")

     def train(self, collection: Collection):
         """
         Wrapper function where the process of training is done
         """
-        features, X_char, my_embedding, tags, entities = self.get_sentences(collection)
+        features, X_char, tags, entities = self.get_sentences(collection)
         X, (y_tags, y_entities) = self.preprocessing(features, (tags, entities))
         self.get_model()
-        return self.fit_model((X, X_char, my_embedding), (y_tags, y_entities))
+        return self.fit_model((X, X_char), (y_tags, y_entities))

     def get_model(self):
         """
@@ -47,9 +47,9 @@ def get_model(self):
         # input for words
         inputs = Input(shape=(None, self.n_features))
         # outputs = Embedding(input_dim=35179, output_dim=20,
-        emb_in = Input(shape=(None, 300))
-        # input_length=self.X_shape[1], mask_zero=True)(inputs)  # 20-dim embedding
-        emb_mask = Masking(mask_value=0, input_shape=(None, 10))(emb_in)
+        # emb_in = Input(shape=(None, 300))
+        # # input_length=self.X_shape[1], mask_zero=True)(inputs)  # 20-dim embedding
+        # emb_mask = Masking(mask_value=0, input_shape=(None, 10))(emb_in)
         # input for characters
         char_in = Input(shape=(None, 10))
         # inputs of the embeddings
@@ -59,10 +59,10 @@
         char_enc = TimeDistributed(LSTM(units=20, return_sequences=False, recurrent_dropout=0.5))(emb_char)

         # main LSTM
-        x = concatenate((inputs, char_enc, emb_mask))
-        x = Bidirectional(LSTM(units=64, return_sequences=True,
+        x = concatenate((inputs, char_enc))
+        x = Bidirectional(LSTM(units=32, return_sequences=True,
                                recurrent_dropout=0.1))(x)  # variational biLSTM
-        x = Bidirectional(LSTM(units=64, return_sequences=True,
+        x = Bidirectional(LSTM(units=32, return_sequences=True,
                                recurrent_dropout=0.2, dropout=0.2))(x)
         # x = MaxPooling1D()(x)
         out1 = TimeDistributed(Dense(self.n_tags, activation="softmax"))(x)  # a dense layer as suggested by neuralNer
@@ -71,7 +71,7 @@
         # crf = CRF(self.n_labels)  # CRF layer
         # outputs = crf(outputs)  # output

-        model = Model(inputs=(inputs, char_in, emb_in), outputs=(out1, out2))
+        model = Model(inputs=(inputs, char_in), outputs=(out1, out2))
         model.compile(optimizer="adam", metrics=self.metrics,
                       # loss=weighted_loss(categorical_crossentropy, self.weights))
                       loss=categorical_crossentropy)
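
For reference, a self-contained sketch of the NER network as it stands after this commit (fastText embedding input removed, biLSTM units halved from 64 to 32). The sizes are illustrative stand-ins for self.n_features, self.n_tags, self.n_entities and the character vocabulary:

    from tensorflow.keras.layers import (Input, Embedding, TimeDistributed, LSTM,
                                         Bidirectional, Dense, concatenate)
    from tensorflow.keras.models import Model

    n_features, n_chars, n_tags, n_entities = 30, 100, 9, 4  # illustrative sizes

    word_in = Input(shape=(None, n_features))   # per-token feature vectors
    char_in = Input(shape=(None, 10))           # 10 character ids per token

    # character-level encoder: embed each character, then run an LSTM per token
    emb_char = TimeDistributed(Embedding(input_dim=n_chars, output_dim=10))(char_in)
    char_enc = TimeDistributed(LSTM(units=20, recurrent_dropout=0.5))(emb_char)

    # main biLSTM stack over the concatenated word and character representations
    x = concatenate([word_in, char_enc])
    x = Bidirectional(LSTM(units=32, return_sequences=True, recurrent_dropout=0.1))(x)
    x = Bidirectional(LSTM(units=32, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)

    out_tags = TimeDistributed(Dense(n_tags, activation="softmax"))(x)
    out_ents = TimeDistributed(Dense(n_entities, activation="softmax"))(x)

    model = Model(inputs=[word_in, char_in], outputs=[out_tags, out_ents])
    model.compile(optimizer="adam", loss="categorical_crossentropy")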
@@ -101,27 +101,27 @@ def get_sentences(self, collection: Collection):
         entities = []
         X_char = []
         self.char2idx = get_char2idx(collection)
-        embedding_vec = []
+        # embedding_vec = []
         for sentence in collection:
-            feat, chars, embedding, tag, entity = load_training_entities(sentence, self.char2idx, self.ScieloSku)
+            feat, chars, tag, entity = load_training_entities(sentence, self.char2idx)
             features.append(feat)
             tags.append(tag)
             entities.append(entity)
             X_char.append(np.array(chars))
-            embedding_vec.append(embedding)
-        return features, X_char, embedding_vec, tags, entities
+            # embedding_vec.append(embedding)
+        return features, X_char, tags, entities

     def get_features(self, collection: Collection):
         """Given a collection, the features of its sentences are returned"""
         features = []
         X_char = []
-        embedding_vec = []
+        # embedding_vec = []
         for sentence in collection:
-            (feat, chars), embedding = load_testing_entities(sentence, self.char2idx, self.ScieloSku)
+            feat, chars = load_testing_entities(sentence, self.char2idx)
             features.append(feat)
             X_char.append(chars)
-            embedding_vec.append(embedding)
-        return features, X_char, embedding_vec
+            # embedding_vec.append(embedding)
+        return features, X_char

     def fit_model(self, X, y, plot=False):
         """
@@ -130,30 +130,30 @@ def fit_model(self, X, y, plot=False):
         # hist = self.model.fit(X, y, batch_size=32, epochs=5,
         #                       validation_split=0.2, verbose=1)
         # hist = self.model.fit(MyBatchGenerator(X, y, batch_size=30), epochs=5)
-        X, X_char, my_Embedding = X
+        X, X_char = X
         y_tags, y_entities = y
         num_examples = len(X)

         # self.model.fit(self.generator(X, y), steps_per_epoch=steps_per_epoch, epochs=5)
-        x_shapes, x_char_shapes, my_Embedding_shapes, yt_shapes, ye_shapes = train_by_shape(X, y_tags, y_entities,
-                                                                                            X_char, my_Embedding)
+        x_shapes, x_char_shapes, yt_shapes, ye_shapes = train_by_shape(X, y_tags, y_entities,
+                                                                       X_char)
         for shape in x_shapes:
             self.model.fit(
-                (np.asarray(x_shapes[shape]), np.asarray(x_char_shapes[shape]), np.asarray(my_Embedding_shapes[shape])),
-                # (np.asarray(x_shapes[shape]), np.asarray(x_char_shapes[shape])),
+                # (np.asarray(x_shapes[shape]), np.asarray(x_char_shapes[shape]), np.asarray(my_Embedding_shapes[shape])),
+                (np.asarray(x_shapes[shape]), np.asarray(x_char_shapes[shape])),
                 (np.asarray(yt_shapes[shape]), np.asarray(ye_shapes[shape])),
                 epochs=5)

     def test_model(self, collection: Collection) -> Collection:
         collection = collection.clone()
-        features, X_char, my_embedding = self.get_features(collection)
+        features, X_char = self.get_features(collection)
         X = self.preprocess_features(features, train=False)
-        x_shapes, x_char_shapes, my_embedding_shapes, indices = predict_by_shape(X, X_char, my_embedding)
+        x_shapes, x_char_shapes, indices = predict_by_shape(X, X_char)
         pred_tags = []
         pred_entities = []
-        for x_items, x_chars, z_items in zip(x_shapes, x_char_shapes, my_embedding_shapes):
-            pt, pe = self.model.predict((np.asarray(x_items), np.asarray(x_chars), np.asarray(z_items)))
-            # pt, pe = self.model.predict((np.asarray(x_items), np.asarray(x_chars)))
+        for x_items, x_chars in zip(x_shapes, x_char_shapes):
+            # pt, pe = self.model.predict((np.asarray(x_items), np.asarray(x_chars), np.asarray(z_items)))
+            pt, pe = self.model.predict((np.asarray(x_items), np.asarray(x_chars)))
             pred_tags.extend(pt)
             pred_entities.extend(pe)
         labels_tags = self.convert_to_label(pred_tags, self.encoder_tags)
@@ -193,9 +193,9 @@ def load_model(self, name):
     collection = Collection().load_dir(Path('2021/ref/training'))
     # dev_set = Collection().load_dir(Path('2021/eval/develop/scenario1-main'))
     ner_clf = NERClassifier()
-    # ner_clf.train(collection)
-    # ner_clf.save_model('ner')
-    ner_clf.load_model('ner')
+    ner_clf.train(collection)
+    ner_clf.save_model('ner')
+    # ner_clf.load_model('ner')
     ner_clf.eval(Path('2021/eval/develop/'), Path('2021/submissions/ner/develop/run1'))
     score.main(Path('2021/eval/develop'),
                Path('2021/submissions/ner/develop'),
scripts/ner_utils.py (18 additions, 18 deletions)

@@ -85,22 +85,22 @@ def get_vec(tokens, model, lang):
     return [np.zeros(300) for _ in tokens]


-def load_training_entities(sentence: Sentence, char2idx, model):
+def load_training_entities(sentence: Sentence, char2idx):
     lang = detect_language(sentence.text)
     nlp = nlp_es if lang == 'es' else nlp_en
     doc = nlp(sentence.text)
-    embedding = get_vec(doc, model, lang)
+    # embedding = get_vec(doc, model, lang)
     features, X_char = get_features(doc, char2idx)
     tags, entities = get_labels(doc, sentence)
-    return features, X_char, embedding, list(tags), list(entities)
+    return features, X_char, list(tags), list(entities)


-def load_testing_entities(sentence: Sentence, char2idx, model):
+def load_testing_entities(sentence: Sentence, char2idx):
     lang = detect_language(sentence.text)
     nlp = nlp_es if lang == 'es' else nlp_en
     doc = nlp(sentence.text)
-    embedding = get_vec(doc, model, lang)
-    return get_features(doc, char2idx), embedding
+    # embedding = get_vec(doc, model, lang)
+    return get_features(doc, char2idx)  # , embedding


 def get_char2idx(collection: Collection):
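
Both loaders pick a spaCy pipeline based on the detected sentence language before extracting features. A minimal sketch of that dispatch, assuming langdetect-style detection and standard spaCy model names (the model names and the detection library are assumptions, not confirmed by this diff):

    import spacy
    from langdetect import detect  # assumption: detect_language wraps something like this

    nlp_es = spacy.load('es_core_news_sm')  # assumed model names
    nlp_en = spacy.load('en_core_web_sm')

    def tokenize_by_language(text):
        """Route a sentence to the Spanish or English pipeline."""
        nlp = nlp_es if detect(text) == 'es' else nlp_en
        return [token.text for token in nlp(text)]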
@@ -116,7 +116,7 @@ def get_char2idx(collection: Collection):
     return char2idx


-def train_by_shape(X, y_tags, y_ents, X_char, my_embedding):
+def train_by_shape(X, y_tags, y_ents, X_char):
     """
     Separates the features and labels by their shape
     :param X: Word-features
@@ -128,24 +128,24 @@ def train_by_shape(X, y_tags, y_ents, X_char, my_embedding):
     yt_shapes = {}
     ye_shapes = {}
     x_char_shapes = {}
-    my_embedding_shapes = {}
-    for itemX, X_char, y_t, y_e, itemZ in zip(X, X_char, y_tags, y_ents, my_embedding):
+    # my_embedding_shapes = {}
+    for itemX, X_char, y_t, y_e in zip(X, X_char, y_tags, y_ents):
         try:
             x_shapes[itemX.shape[0]].append(itemX)
             x_char_shapes[itemX.shape[0]].append(X_char)
             yt_shapes[itemX.shape[0]].append(y_t)
             ye_shapes[itemX.shape[0]].append(y_e)
-            my_embedding_shapes[itemX.shape[0]].append(itemZ)
+            # my_embedding_shapes[itemX.shape[0]].append(itemZ)
         except KeyError:
             x_shapes[itemX.shape[0]] = [itemX]  # initially a list, because we're going to append items
             x_char_shapes[itemX.shape[0]] = [X_char]
             yt_shapes[itemX.shape[0]] = [y_t]
             ye_shapes[itemX.shape[0]] = [y_e]
-            my_embedding_shapes[itemX.shape[0]] = [itemZ]
-    return x_shapes, x_char_shapes, my_embedding_shapes, yt_shapes, ye_shapes
+            # my_embedding_shapes[itemX.shape[0]] = [itemZ]
+    return x_shapes, x_char_shapes, yt_shapes, ye_shapes


-def predict_by_shape(X, X_char, my_embedding):
+def predict_by_shape(X, X_char):
     """
     Separates the features by their shape
     :param X: Word-features
@@ -155,19 +155,19 @@ def predict_by_shape(X, X_char, my_embedding):
     x_char_shapes = {}
     x_shapes = {}
     indices = {}
-    my_embedding_shapes = {}
-    for i, (itemX, X_char, itemZ) in enumerate(zip(X, X_char, my_embedding)):
+    # my_embedding_shapes = {}
+    for i, (itemX, X_char) in enumerate(zip(X, X_char)):
         try:
             x_char_shapes[itemX.shape[0]].append(X_char)
             x_shapes[len(itemX)].append(itemX)
             indices[len(itemX)].append(i)
-            my_embedding_shapes[len(itemX)].append(itemZ)
+            # my_embedding_shapes[len(itemX)].append(itemZ)
         except KeyError:
             x_shapes[len(itemX)] = [itemX]  # initially a list, because we're going to append items
             x_char_shapes[itemX.shape[0]] = [X_char]
             indices[len(itemX)] = [i]
-            my_embedding_shapes[len(itemX)] = [itemZ]
-    return x_shapes.values(), x_char_shapes.values(), my_embedding_shapes.values(), chain(*indices.values())
+            # my_embedding_shapes[len(itemX)] = [itemZ]
+    return x_shapes.values(), x_char_shapes.values(), chain(*indices.values())


 ################################ Postprocessing ################################
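
train_by_shape and predict_by_shape share one idea: group variable-length sequences by length so that each model.fit/model.predict call receives a uniformly shaped batch and no padding is needed. A compact sketch of that grouping, using defaultdict instead of the try/except KeyError pattern (the helper name is illustrative):

    from collections import defaultdict

    def bucket_by_length(sequences):
        """Group sequences by length, remembering original positions."""
        buckets = defaultdict(list)
        for i, seq in enumerate(sequences):
            buckets[len(seq)].append((i, seq))  # index lets callers restore input order
        return buckets

    # bucket_by_length([[1, 2], [3], [4, 5]])
    # -> {2: [(0, [1, 2]), (2, [4, 5])], 1: [(1, [3])]}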
scripts/re_clsf.py (5 additions, 6 deletions)

@@ -16,7 +16,6 @@
 from utils import weighted_loss
 import numpy as np
 import fasttext
-from tensorflow_addons.metrics import FBetaScore


 class REClassifier(BaseClassifier):
@@ -53,8 +52,8 @@ def get_model(self):
         x = concatenate([inputs, x, emb_in])
         x = Bidirectional(LSTM(units=32, return_sequences=True,
                                recurrent_dropout=0.1))(x)  # variational biLSTM
-        x = Bidirectional(LSTM(units=32, return_sequences=True,
-                               recurrent_dropout=0.2, dropout=0.2))(x)
+        # x = Bidirectional(LSTM(units=32, return_sequences=True,
+        #                        recurrent_dropout=0.2, dropout=0.2))(x)
         outputs = TimeDistributed(Dense(self.n_labels, activation="softmax"))(
             x)  # a dense layer as suggested by neuralNer
         # crf = CRF(8)  # CRF layer
@@ -171,9 +170,9 @@ def eval(self, path: Path, submit: Path):
 if __name__ == "__main__":
     collection = Collection().load_dir(Path('../2021/ref/training'))
     re_clf = REClassifier()
-    re_clf.train(collection)
-    re_clf.save_model('re')
-    # # re_clf.load_model('re')
+    # re_clf.train(collection)
+    # re_clf.save_model('re')
+    re_clf.load_model('re')
     re_clf.eval(Path('../2021/eval/develop/'), Path('../2021/submissions/ner/develop/run1'))
     score.main(Path('../2021/eval/develop'),
                Path('../2021/submissions/ner/develop'),