44from .process_utils import stem_4_letters_word , stem_4_letters_line , stem_4_letters_string
55from .process_utils import filter_eng_by_stopwords , get_lineartok_with_rel
66from .process_utils import get_id_mapping_uniq
7-
7+ from . proc_data import ProcData
88
# Default directory holding the resource files (stopword lists, id maps)
# that ship alongside this module.
_this_dir = os.path.dirname(os.path.realpath(__file__))
default_res_dir = os.path.realpath(os.path.join(_this_dir, 'resources'))
1212
1313
# Preprocess for inference
def preprocess_infer(eng_lines, amr_lines, **kwargs):
    """Preprocess parallel English / AMR line lists for alignment inference.

    Args:
        eng_lines: list of English sentence strings, parallel to `amr_lines`.
        amr_lines: list of linearized AMR strings, one per English sentence.
        **kwargs:
            res_dir:    resource directory (default: `default_res_dir`)
            eng_sw_fn:  English stopword file (default: res_dir/eng_stopwords.txt)
            amr_sw_fn:  AMR stopword file (default: res_dir/amr_stopwords.txt)

    Returns:
        ProcData holding the original lines, the token/position and AMR-tuple
        bookkeeping lines (needed by post-processing), and the stemmed,
        stopword-filtered English and AMR lines used as translation input.

    Raises:
        ValueError: if the two input lists differ in length, or if stopword
            filtering leaves an English line empty.
    """
    # Real input validation, not `assert` — asserts are stripped under -O.
    if len(eng_lines) != len(amr_lines):
        raise ValueError('eng_lines and amr_lines must be the same length '
                         '(%d != %d)' % (len(eng_lines), len(amr_lines)))

    # Resource filenames
    res_dir = kwargs.get('res_dir', default_res_dir)
    eng_sw_fn = kwargs.get('eng_sw_fn', os.path.join(res_dir, 'eng_stopwords.txt'))
    amr_sw_fn = kwargs.get('amr_sw_fn', os.path.join(res_dir, 'amr_stopwords.txt'))

    # Filter out stopwords from sentences
    eng_tok_filtered_lines, eng_tok_origpos_lines = filter_eng_by_stopwords(eng_lines, eng_sw_fn)
    for i, line in enumerate(eng_tok_origpos_lines):
        if not line.strip():
            raise ValueError('!!! ERROR Empty line# %d. This will cause issues and must be fixed !!!' % i)

    # Stem sentence tokens
    eng_preproc_lines = [stem_4_letters_line(l) for l in eng_tok_filtered_lines]

    # Process the AMR data / remove stopwords
    amr_linear_lines, amr_tuple_lines = get_lineartok_with_rel(amr_lines, amr_sw_fn)

    # Stem the AMR lines: strip PropBank-style sense suffixes (e.g. "want-01"),
    # drop quotes, then 4-letter stem each token.
    amr_preproc_lines = []
    for line in amr_linear_lines:
        new_tokens = []
        for token in line.split():
            token = re.sub(r'\-[0-9]{2,3}$', '', token)
            token = token.replace('"', '')
            token = stem_4_letters_word(token).strip()
            new_tokens.append(token)
        amr_preproc_lines.append(' '.join(new_tokens))

    # Gather the data (internal invariant — both sides derive from equal-length inputs)
    assert len(eng_preproc_lines) == len(amr_preproc_lines)
    data = ProcData(eng_lines, amr_lines, eng_tok_origpos_lines, amr_tuple_lines,
                    eng_preproc_lines, amr_preproc_lines)
    return data
5750
5851
5952# Preprocess the training data. This is the similar to inference but add a lot of
6053# extra translation lines from resource files, etc..
61- def preprocess_train (wk_dir , eng_lines , amr_lines , ** kwargs ):
54+ def preprocess_train (eng_lines , amr_lines , ** kwargs ):
6255 repeat_td = kwargs .get ('repeat_td' , 10 ) # 10X is original value from isi aligner
6356 # Resource filenames
6457 res_dir = kwargs .get ('res_dir' , default_res_dir )
@@ -67,10 +60,12 @@ def preprocess_train(wk_dir, eng_lines, amr_lines, **kwargs):
6760 amr_id_map_fn = kwargs .get ('amr_id_map_fn' , os .path .join (res_dir , 'amr_id_map.txt' ))
6861
6962 # Run the inference process which creates the basic translation data
70- eng_tok_stemmed_lines , amr_linear_stemmed_lines = preprocess_infer (wk_dir , eng_lines , amr_lines , ** kwargs )
63+ data = preprocess_infer (eng_lines , amr_lines , ** kwargs )
64+ eng_preproc_lines = data .eng_preproc_lines
65+ amr_preproc_lines = data .amr_preproc_lines
7166
7267 # Get tokens common between the two datasets (obvious translations
73- common_tok_lines = get_id_mapping_uniq (eng_tok_stemmed_lines , amr_linear_stemmed_lines )
68+ common_tok_lines = get_id_mapping_uniq (eng_preproc_lines , amr_preproc_lines )
7469 eng_td_lines = common_tok_lines [:] # copy
7570
7671 # Append the second field in prep-roles.id.txt
@@ -101,8 +96,8 @@ def preprocess_train(wk_dir, eng_lines, amr_lines, **kwargs):
10196
10297 # Create the final training data using the original sentences
10398 # and 10X copies of the additional data (other translations)
104- eng_td_lines = eng_tok_stemmed_lines + [l for _ in range (repeat_td ) for l in eng_td_lines ]
105- amr_td_lines = amr_linear_stemmed_lines + [l for _ in range (repeat_td ) for l in amr_td_lines ]
106- assert len (eng_td_lines ) == len (amr_td_lines )
99+ data . eng_preproc_lines += [l for _ in range (repeat_td ) for l in eng_td_lines ]
100+ data . amr_preproc_lines += [l for _ in range (repeat_td ) for l in amr_td_lines ]
101+ assert len (data . eng_preproc_lines ) == len (data . amr_preproc_lines )
107102
108- return eng_td_lines , amr_td_lines
103+ return data
0 commit comments