Commit f78f90a: Last commit after submitting

lorainemg committed May 1, 2021 (1 parent: cf79a5e)

Showing 4 changed files with 80 additions and 75 deletions.
scripts/classifier.py (26 additions, 20 deletions)

@@ -28,13 +28,19 @@ def fit(self, path: Path):
         collection = Collection().load_dir(path)

         print(f"Loaded {len(collection)} sentences for fitting.")
-        print('Starting ner classifier training')
-        self.ner_classifier.train(collection)
-        print('Starting re classifier training')
-        self.re_classifier.train(collection)
+        try:
+            print('Loading ner classifier')
+            self.ner_classifier.load_model('ner')
+            print('Loading re classifier')
+            self.re_classifier.load_model('re')
+        except:
+            print('Starting ner classifier training')
+            self.ner_classifier.train(collection)
+            print('Starting re classifier training')
+            self.re_classifier.train(collection)
         # print(f"Training completed: Stored {len(keyphrases)} keyphrases and {len(relations)} relation pairs.")

-    def eval(self, path: Path, scenarios: List[int], submit: Path):
+    def eval(self, path: Path, scenarios: List[int], submit: Path, run):
         """Function that evals according to the baseline classifier"""
         # It's not changed
         for id in scenarios:
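
The hunk above wraps training in a try/except so that previously saved weights are reused when present. A minimal sketch of that load-or-train pattern, assuming classifiers that expose load_model()/train() as above (the helper name fit_or_load and the exception choice are illustrative, not from the repository):

    def fit_or_load(classifier, name, collection):
        """Reuse saved weights when they exist; otherwise train from scratch."""
        try:
            classifier.load_model(name)   # assumed to raise (e.g. OSError) when no saved model exists
            print(f'Loaded {name} classifier')
        except Exception:                 # narrower than the bare `except:` in the commit
            print(f'Starting {name} classifier training')
            classifier.train(collection)

Note that the committed bare `except:` also swallows KeyboardInterrupt and SystemExit; catching Exception (or the specific load error) is the safer variant.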
@@ -47,8 +53,8 @@ def eval(self, path: Path, scenarios: List[int], submit: Path):
print(f"Loaded {len(input_data)} input sentences.")
output_data = self.run(input_data, taskA, taskB)

print(f"Writing output to {submit / folder}")
output_data.dump(submit / folder / "output.txt", skip_empty_sentences=False)
print(f"Writing output to {submit / run / folder }")
output_data.dump(submit / run / folder / "output.txt", skip_empty_sentences=False)


def run(self, collection, taskA, taskB):
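
The new `run` argument threads the run name through to the output path as a separate segment. A short pathlib sketch of how the segments compose (the folder value mirrors the scenario directories used elsewhere in the repo and is illustrative):

    from pathlib import Path

    submit = Path('2021/submissions/classifier/testing')
    run, folder = 'run1', 'scenario1-main'   # illustrative values

    # Before this commit the run was baked into `submit`; now it is its own segment:
    out = submit / run / folder / 'output.txt'
    print(out)  # 2021/submissions/classifier/testing/run1/scenario1-main/output.txt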
@@ -85,38 +91,38 @@ def main():
         eval_ = Path('2021/eval/testing')
         print(f'Evaluating testing run {i}')
         scenarios = [1, 2, 3]
-        submit_ = Path(f'2021/submissions/classifier/testing/run{i}')
+        submit_ = Path(f'2021/submissions/classifier/testing/')

-        clsf.eval(eval_, scenarios, submit_)
+        clsf.eval(eval_, scenarios, submit_, f'run{i}')

-    score.main(Path('../2021/eval/testing'),
-               Path('../2021/submissions/classifier/testing'),
+    score.main(Path('2021/eval/testing'),
+               Path('2021/submissions/classifier/testing'),
                runs=[1,2,3], scenarios=[1,2,3], verbose=True, prefix="")


     for i in range(3):
         eval_ = Path('2021/eval/training')
         print(f'Evaluating training run {i}')
         scenarios = [1, 2, 3]
-        submit_ = Path(f'2021/submissions/classifier/training/run{i}')
+        submit_ = Path(f'2021/submissions/classifier/training/')

-        clsf.eval(eval_, scenarios, submit_)
+        clsf.eval(eval_, scenarios, submit_, f'run{i}')

-    score.main(Path('../2021/eval/training'),
-               Path('../2021/submissions/classifier/training'),
+    score.main(Path('2021/eval/training'),
+               Path('2021/submissions/classifier/training'),
                runs=[1,2,3], scenarios=[1,2,3], verbose=True, prefix="")

     for i in range(3):
         eval_ = Path('2021/eval/develop')
         print(f'Evaluating develop run {i}')
         scenarios = [1, 2, 3]
-        submit_ = Path(f'2021/submissions/classifier/develop/run{i}')
+        submit_ = Path(f'2021/submissions/classifier/develop/')

-        clsf.eval(eval_, scenarios, submit_)
+        clsf.eval(eval_, scenarios, submit_, f'run{i}')


-    score.main(Path('../2021/eval/develop'),
-               Path('../2021/submissions/classifier/develop'),
+    score.main(Path('2021/eval/develop'),
+               Path('2021/submissions/classifier/develop'),
                runs=[1,2,3], scenarios=[1,2,3], verbose=True, prefix="")

 if __name__ == "__main__":
scripts/ner_clsf.py (31 additions, 31 deletions)

@@ -28,16 +28,16 @@ def __init__(self):
         self.n_entities = 4
         self.encoder_tags = LabelEncoder()
         self.encoder_entities = LabelEncoder()
-        self.ScieloSku = fasttext.load_model("./Scielo_cbow_cased.bin")
+        # self.ScieloSku = fasttext.load_model("./Scielo_cbow_cased.bin")

     def train(self, collection: Collection):
         """
         Wrapper function where the process of training is done
         """
-        features, X_char, my_embedding, tags, entities = self.get_sentences(collection)
+        features, X_char, tags, entities = self.get_sentences(collection)
         X, (y_tags, y_entities) = self.preprocessing(features, (tags, entities))
         self.get_model()
-        return self.fit_model((X, X_char, my_embedding), (y_tags, y_entities))
+        return self.fit_model((X, X_char), (y_tags, y_entities))

     def get_model(self):
         """
@@ -47,9 +47,9 @@ def get_model(self):
         # input for words
         inputs = Input(shape=(None, self.n_features))
         # outputs = Embedding(input_dim=35179, output_dim=20,
-        emb_in = Input(shape=(None, 300))
-        # input_length=self.X_shape[1], mask_zero=True)(inputs)  # 20-dim embedding
-        emb_mask = Masking(mask_value=0, input_shape=(None, 10))(emb_in)
+        # emb_in = Input(shape=(None, 300))
+        # # input_length=self.X_shape[1], mask_zero=True)(inputs)  # 20-dim embedding
+        # emb_mask = Masking(mask_value=0, input_shape=(None, 10))(emb_in)
         # input for characters
         char_in = Input(shape=(None, 10))
         # inputs of the embeddings
@@ -59,10 +59,10 @@
         char_enc = TimeDistributed(LSTM(units=20, return_sequences=False, recurrent_dropout=0.5))(emb_char)

         # main LSTM
-        x = concatenate((inputs, char_enc, emb_mask))
-        x = Bidirectional(LSTM(units=64, return_sequences=True,
+        x = concatenate((inputs, char_enc))
+        x = Bidirectional(LSTM(units=32, return_sequences=True,
                                recurrent_dropout=0.1))(x)  # variational biLSTM
-        x = Bidirectional(LSTM(units=64, return_sequences=True,
+        x = Bidirectional(LSTM(units=32, return_sequences=True,
                                recurrent_dropout=0.2, dropout=0.2))(x)
         # x = MaxPooling1D()(x)
         out1 = TimeDistributed(Dense(self.n_tags, activation="softmax"))(x)  # a dense layer as suggested by neuralNer
@@ -71,7 +71,7 @@
         # crf = CRF(self.n_labels)  # CRF layer
         # outputs = crf(outputs)  # output

-        model = Model(inputs=(inputs, char_in, emb_in), outputs=(out1, out2))
+        model = Model(inputs=(inputs, char_in), outputs=(out1, out2))
         model.compile(optimizer="adam", metrics=self.metrics,
                       # loss=weighted_loss(categorical_crossentropy, self.weights))
                       loss=categorical_crossentropy)
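
For reference, a self-contained sketch of the NER network as it stands after this commit (fastText embedding input removed, biLSTM units halved from 64 to 32). The sizes are illustrative stand-ins for self.n_features, self.n_tags, self.n_entities and the character vocabulary:

    from tensorflow.keras.layers import (Input, Embedding, TimeDistributed, LSTM,
                                         Bidirectional, Dense, concatenate)
    from tensorflow.keras.models import Model

    n_features, n_chars, n_tags, n_entities = 30, 100, 9, 4  # illustrative sizes

    word_in = Input(shape=(None, n_features))   # per-token feature vectors
    char_in = Input(shape=(None, 10))           # 10 character ids per token

    # character-level encoder: embed each character, then run an LSTM per token
    emb_char = TimeDistributed(Embedding(input_dim=n_chars, output_dim=10))(char_in)
    char_enc = TimeDistributed(LSTM(units=20, recurrent_dropout=0.5))(emb_char)

    # main biLSTM stack over the concatenated word and character representations
    x = concatenate([word_in, char_enc])
    x = Bidirectional(LSTM(units=32, return_sequences=True, recurrent_dropout=0.1))(x)
    x = Bidirectional(LSTM(units=32, return_sequences=True,
                           recurrent_dropout=0.2, dropout=0.2))(x)

    out_tags = TimeDistributed(Dense(n_tags, activation="softmax"))(x)
    out_ents = TimeDistributed(Dense(n_entities, activation="softmax"))(x)

    model = Model(inputs=[word_in, char_in], outputs=[out_tags, out_ents])
    model.compile(optimizer="adam", loss="categorical_crossentropy")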
@@ -101,27 +101,27 @@ def get_sentences(self, collection: Collection):
         entities = []
         X_char = []
         self.char2idx = get_char2idx(collection)
-        embedding_vec = []
+        # embedding_vec = []
         for sentence in collection:
-            feat, chars, embedding, tag, entity = load_training_entities(sentence, self.char2idx, self.ScieloSku)
+            feat, chars, tag, entity = load_training_entities(sentence, self.char2idx)
             features.append(feat)
             tags.append(tag)
             entities.append(entity)
             X_char.append(np.array(chars))
-            embedding_vec.append(embedding)
-        return features, X_char, embedding_vec, tags, entities
+            # embedding_vec.append(embedding)
+        return features, X_char, tags, entities

     def get_features(self, collection: Collection):
         """Given a collection, the features of its sentences are returned"""
         features = []
         X_char = []
-        embedding_vec = []
+        # embedding_vec = []
         for sentence in collection:
-            (feat, chars), embedding = load_testing_entities(sentence, self.char2idx, self.ScieloSku)
+            feat, chars = load_testing_entities(sentence, self.char2idx)
             features.append(feat)
             X_char.append(chars)
-            embedding_vec.append(embedding)
-        return features, X_char, embedding_vec
+            # embedding_vec.append(embedding)
+        return features, X_char

     def fit_model(self, X, y, plot=False):
         """
@@ -130,30 +130,30 @@ def fit_model(self, X, y, plot=False):
         # hist = self.model.fit(X, y, batch_size=32, epochs=5,
         #                       validation_split=0.2, verbose=1)
         # hist = self.model.fit(MyBatchGenerator(X, y, batch_size=30), epochs=5)
-        X, X_char, my_Embedding = X
+        X, X_char = X
         y_tags, y_entities = y
         num_examples = len(X)

         # self.model.fit(self.generator(X, y), steps_per_epoch=steps_per_epoch, epochs=5)
-        x_shapes, x_char_shapes, my_Embedding_shapes, yt_shapes, ye_shapes = train_by_shape(X, y_tags, y_entities,
-                                                                                            X_char, my_Embedding)
+        x_shapes, x_char_shapes, yt_shapes, ye_shapes = train_by_shape(X, y_tags, y_entities,
+                                                                       X_char)
         for shape in x_shapes:
             self.model.fit(
-                (np.asarray(x_shapes[shape]), np.asarray(x_char_shapes[shape]), np.asarray(my_Embedding_shapes[shape])),
-                # (np.asarray(x_shapes[shape]), np.asarray(x_char_shapes[shape])),
+                # (np.asarray(x_shapes[shape]), np.asarray(x_char_shapes[shape]), np.asarray(my_Embedding_shapes[shape])),
+                (np.asarray(x_shapes[shape]), np.asarray(x_char_shapes[shape])),
                 (np.asarray(yt_shapes[shape]), np.asarray(ye_shapes[shape])),
                 epochs=5)

     def test_model(self, collection: Collection) -> Collection:
         collection = collection.clone()
-        features, X_char, my_embedding = self.get_features(collection)
+        features, X_char = self.get_features(collection)
         X = self.preprocess_features(features, train=False)
-        x_shapes, x_char_shapes, my_embedding_shapes, indices = predict_by_shape(X, X_char, my_embedding)
+        x_shapes, x_char_shapes, indices = predict_by_shape(X, X_char)
         pred_tags = []
         pred_entities = []
-        for x_items, x_chars, z_items in zip(x_shapes, x_char_shapes, my_embedding_shapes):
-            pt, pe = self.model.predict((np.asarray(x_items), np.asarray(x_chars), np.asarray(z_items)))
-            # pt, pe = self.model.predict((np.asarray(x_items), np.asarray(x_chars)))
+        for x_items, x_chars in zip(x_shapes, x_char_shapes):
+            # pt, pe = self.model.predict((np.asarray(x_items), np.asarray(x_chars), np.asarray(z_items)))
+            pt, pe = self.model.predict((np.asarray(x_items), np.asarray(x_chars)))
             pred_tags.extend(pt)
             pred_entities.extend(pe)
         labels_tags = self.convert_to_label(pred_tags, self.encoder_tags)
@@ -193,9 +193,9 @@ def load_model(self, name):
     collection = Collection().load_dir(Path('2021/ref/training'))
     # dev_set = Collection().load_dir(Path('2021/eval/develop/scenario1-main'))
     ner_clf = NERClassifier()
-    # ner_clf.train(collection)
-    # ner_clf.save_model('ner')
-    ner_clf.load_model('ner')
+    ner_clf.train(collection)
+    ner_clf.save_model('ner')
+    # ner_clf.load_model('ner')
     ner_clf.eval(Path('2021/eval/develop/'), Path('2021/submissions/ner/develop/run1'))
     score.main(Path('2021/eval/develop'),
                Path('2021/submissions/ner/develop'),
scripts/ner_utils.py (18 additions, 18 deletions)

@@ -85,22 +85,22 @@ def get_vec(tokens, model, lang):
     return [np.zeros(300) for _ in tokens]


-def load_training_entities(sentence: Sentence, char2idx, model):
+def load_training_entities(sentence: Sentence, char2idx):
     lang = detect_language(sentence.text)
     nlp = nlp_es if lang == 'es' else nlp_en
     doc = nlp(sentence.text)
-    embedding = get_vec(doc, model, lang)
+    # embedding = get_vec(doc, model, lang)
     features, X_char = get_features(doc, char2idx)
     tags, entities = get_labels(doc, sentence)
-    return features, X_char, embedding, list(tags), list(entities)
+    return features, X_char, list(tags), list(entities)


-def load_testing_entities(sentence: Sentence, char2idx, model):
+def load_testing_entities(sentence: Sentence, char2idx):
     lang = detect_language(sentence.text)
     nlp = nlp_es if lang == 'es' else nlp_en
     doc = nlp(sentence.text)
-    embedding = get_vec(doc, model, lang)
-    return get_features(doc, char2idx), embedding
+    # embedding = get_vec(doc, model, lang)
+    return get_features(doc, char2idx)  # , embedding


 def get_char2idx(collection: Collection):
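
Both loaders pick a spaCy pipeline based on the detected sentence language before extracting features. A minimal sketch of that dispatch, assuming langdetect-style detection and standard spaCy model names (the model names and the detection library are assumptions, not confirmed by this diff):

    import spacy
    from langdetect import detect  # assumption: detect_language wraps something like this

    nlp_es = spacy.load('es_core_news_sm')  # assumed model names
    nlp_en = spacy.load('en_core_web_sm')

    def tokenize_by_language(text):
        """Route a sentence to the Spanish or English pipeline."""
        nlp = nlp_es if detect(text) == 'es' else nlp_en
        return [token.text for token in nlp(text)]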
@@ -116,7 +116,7 @@ def get_char2idx(collection: Collection):
     return char2idx


-def train_by_shape(X, y_tags, y_ents, X_char, my_embedding):
+def train_by_shape(X, y_tags, y_ents, X_char):
     """
     Separates the features and labels by their shape
     :param X: Word-features
@@ -128,24 +128,24 @@ def train_by_shape(X, y_tags, y_ents, X_char, my_embedding):
     yt_shapes = {}
     ye_shapes = {}
     x_char_shapes = {}
-    my_embedding_shapes = {}
-    for itemX, X_char, y_t, y_e, itemZ in zip(X, X_char, y_tags, y_ents, my_embedding):
+    # my_embedding_shapes = {}
+    for itemX, X_char, y_t, y_e in zip(X, X_char, y_tags, y_ents):
         try:
             x_shapes[itemX.shape[0]].append(itemX)
             x_char_shapes[itemX.shape[0]].append(X_char)
             yt_shapes[itemX.shape[0]].append(y_t)
             ye_shapes[itemX.shape[0]].append(y_e)
-            my_embedding_shapes[itemX.shape[0]].append(itemZ)
+            # my_embedding_shapes[itemX.shape[0]].append(itemZ)
         except KeyError:
             x_shapes[itemX.shape[0]] = [itemX]  # initially a list, because we're going to append items
             x_char_shapes[itemX.shape[0]] = [X_char]
             yt_shapes[itemX.shape[0]] = [y_t]
             ye_shapes[itemX.shape[0]] = [y_e]
-            my_embedding_shapes[itemX.shape[0]] = [itemZ]
-    return x_shapes, x_char_shapes, my_embedding_shapes, yt_shapes, ye_shapes
+            # my_embedding_shapes[itemX.shape[0]] = [itemZ]
+    return x_shapes, x_char_shapes, yt_shapes, ye_shapes


-def predict_by_shape(X, X_char, my_embedding):
+def predict_by_shape(X, X_char):
     """
     Separates the features by their shape
     :param X: Word-features
@@ -155,19 +155,19 @@ def predict_by_shape(X, X_char, my_embedding):
     x_char_shapes = {}
     x_shapes = {}
     indices = {}
-    my_embedding_shapes = {}
-    for i, (itemX, X_char, itemZ) in enumerate(zip(X, X_char, my_embedding)):
+    # my_embedding_shapes = {}
+    for i, (itemX, X_char) in enumerate(zip(X, X_char)):
         try:
             x_char_shapes[itemX.shape[0]].append(X_char)
             x_shapes[len(itemX)].append(itemX)
             indices[len(itemX)].append(i)
-            my_embedding_shapes[len(itemX)].append(itemZ)
+            # my_embedding_shapes[len(itemX)].append(itemZ)
         except KeyError:
             x_shapes[len(itemX)] = [itemX]  # initially a list, because we're going to append items
             x_char_shapes[itemX.shape[0]] = [X_char]
             indices[len(itemX)] = [i]
-            my_embedding_shapes[len(itemX)] = [itemZ]
-    return x_shapes.values(), x_char_shapes.values(), my_embedding_shapes.values(), chain(*indices.values())
+            # my_embedding_shapes[len(itemX)] = [itemZ]
+    return x_shapes.values(), x_char_shapes.values(), chain(*indices.values())


 ################################ Postprocessing ################################
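
train_by_shape and predict_by_shape share one idea: group variable-length sequences by length so that each model.fit/model.predict call receives a uniformly shaped batch and no padding is needed. A compact sketch of that grouping, using defaultdict instead of the try/except KeyError pattern (the helper name is illustrative):

    from collections import defaultdict

    def bucket_by_length(sequences):
        """Group sequences by length, remembering original positions."""
        buckets = defaultdict(list)
        for i, seq in enumerate(sequences):
            buckets[len(seq)].append((i, seq))  # index lets callers restore input order
        return buckets

    # bucket_by_length([[1, 2], [3], [4, 5]])
    # -> {2: [(0, [1, 2]), (2, [4, 5])], 1: [(1, [3])]}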
scripts/re_clsf.py (5 additions, 6 deletions)

@@ -16,7 +16,6 @@
 from utils import weighted_loss
 import numpy as np
 import fasttext
-from tensorflow_addons.metrics import FBetaScore


 class REClassifier(BaseClassifier):
@@ -53,8 +52,8 @@ def get_model(self):
         x = concatenate([inputs, x, emb_in])
         x = Bidirectional(LSTM(units=32, return_sequences=True,
                                recurrent_dropout=0.1))(x)  # variational biLSTM
-        x = Bidirectional(LSTM(units=32, return_sequences=True,
-                               recurrent_dropout=0.2, dropout=0.2))(x)
+        # x = Bidirectional(LSTM(units=32, return_sequences=True,
+        #                        recurrent_dropout=0.2, dropout=0.2))(x)
         outputs = TimeDistributed(Dense(self.n_labels, activation="softmax"))(
             x)  # a dense layer as suggested by neuralNer
         # crf = CRF(8)  # CRF layer
@@ -171,9 +170,9 @@ def eval(self, path: Path, submit: Path):
 if __name__ == "__main__":
     collection = Collection().load_dir(Path('../2021/ref/training'))
     re_clf = REClassifier()
-    re_clf.train(collection)
-    re_clf.save_model('re')
-    # # re_clf.load_model('re')
+    # re_clf.train(collection)
+    # re_clf.save_model('re')
+    re_clf.load_model('re')
     re_clf.eval(Path('../2021/eval/develop/'), Path('../2021/submissions/ner/develop/run1'))
     score.main(Path('../2021/eval/develop'),
                Path('../2021/submissions/ner/develop'),