44from .process_utils import stem_4_letters_word , stem_4_letters_line , stem_4_letters_string
55from .process_utils import filter_eng_by_stopwords , get_lineartok_with_rel
66from .process_utils import get_id_mapping_uniq
7-
7+ from . proc_data import ProcData
88
# Default directory holding the resource files (stopword lists, id maps)
# that ship alongside this module.
_this_dir = os.path.dirname(os.path.realpath(__file__))
default_res_dir = os.path.realpath(os.path.join(_this_dir, 'resources'))
1212
1313
# Preprocess for inference
def preprocess_infer(eng_lines, amr_lines, **kwargs):
    """Preprocess parallel English / AMR line lists for alignment inference.

    Args:
        eng_lines: list of English sentence strings, parallel to `amr_lines`.
        amr_lines: list of linearized AMR strings, one per English sentence.
        **kwargs:
            res_dir:    resource directory (default: `default_res_dir`)
            eng_sw_fn:  English stopword file (default: res_dir/eng_stopwords.txt)
            amr_sw_fn:  AMR stopword file (default: res_dir/amr_stopwords.txt)

    Returns:
        ProcData holding the original lines, the token/position and AMR-tuple
        bookkeeping lines (needed by post-processing), and the stemmed,
        stopword-filtered English and AMR lines used as translation input.

    Raises:
        ValueError: if the two input lists differ in length, or if stopword
            filtering leaves an English line empty.
    """
    # Real input validation, not `assert` — asserts are stripped under -O.
    if len(eng_lines) != len(amr_lines):
        raise ValueError('eng_lines and amr_lines must be the same length '
                         '(%d != %d)' % (len(eng_lines), len(amr_lines)))

    # Resource filenames
    res_dir = kwargs.get('res_dir', default_res_dir)
    eng_sw_fn = kwargs.get('eng_sw_fn', os.path.join(res_dir, 'eng_stopwords.txt'))
    amr_sw_fn = kwargs.get('amr_sw_fn', os.path.join(res_dir, 'amr_stopwords.txt'))

    # Filter out stopwords from sentences
    eng_tok_filtered_lines, eng_tok_origpos_lines = filter_eng_by_stopwords(eng_lines, eng_sw_fn)
    for i, line in enumerate(eng_tok_origpos_lines):
        if not line.strip():
            raise ValueError('!!! ERROR Empty line# %d. This will cause issues and must be fixed !!!' % i)

    # Stem sentence tokens
    eng_preproc_lines = [stem_4_letters_line(l) for l in eng_tok_filtered_lines]

    # Process the AMR data / remove stopwords
    amr_linear_lines, amr_tuple_lines = get_lineartok_with_rel(amr_lines, amr_sw_fn)

    # Stem the AMR lines: strip PropBank-style sense suffixes (e.g. "want-01"),
    # drop quotes, then 4-letter stem each token.
    amr_preproc_lines = []
    for line in amr_linear_lines:
        new_tokens = []
        for token in line.split():
            token = re.sub(r'\-[0-9]{2,3}$', '', token)
            token = token.replace('"', '')
            token = stem_4_letters_word(token).strip()
            new_tokens.append(token)
        amr_preproc_lines.append(' '.join(new_tokens))

    # Gather the data (internal invariant — both sides derive from equal-length inputs)
    assert len(eng_preproc_lines) == len(amr_preproc_lines)
    data = ProcData(eng_lines, amr_lines, eng_tok_origpos_lines, amr_tuple_lines,
                    eng_preproc_lines, amr_preproc_lines)
    return data
5750
5851
5952# Preprocess the training data. This is the similar to inference but add a lot of
6053# extra translation lines from resource files, etc..
61- def preprocess_train (wk_dir , eng_lines , amr_lines , ** kwargs ):
54+ def preprocess_train (eng_lines , amr_lines , ** kwargs ):
6255 repeat_td = kwargs .get ('repeat_td' , 10 ) # 10X is original value from isi aligner
6356 # Resource filenames
6457 res_dir = kwargs .get ('res_dir' , default_res_dir )
@@ -67,10 +60,12 @@ def preprocess_train(wk_dir, eng_lines, amr_lines, **kwargs):
6760 amr_id_map_fn = kwargs .get ('amr_id_map_fn' , os .path .join (res_dir , 'amr_id_map.txt' ))
6861
6962 # Run the inference process which creates the basic translation data
70- eng_tok_stemmed_lines , amr_linear_stemmed_lines = preprocess_infer (wk_dir , eng_lines , amr_lines , ** kwargs )
63+ data = preprocess_infer (eng_lines , amr_lines , ** kwargs )
64+ eng_preproc_lines = data .eng_preproc_lines
65+ amr_preproc_lines = data .amr_preproc_lines
7166
7267 # Get tokens common between the two datasets (obvious translations
73- common_tok_lines = get_id_mapping_uniq (eng_tok_stemmed_lines , amr_linear_stemmed_lines )
68+ common_tok_lines = get_id_mapping_uniq (eng_preproc_lines , amr_preproc_lines )
7469 eng_td_lines = common_tok_lines [:] # copy
7570
7671 # Append the second field in prep-roles.id.txt
@@ -101,8 +96,8 @@ def preprocess_train(wk_dir, eng_lines, amr_lines, **kwargs):
10196
10297 # Create the final training data using the original sentences
10398 # and 10X copies of the additional data (other translations)
104- eng_td_lines = eng_tok_stemmed_lines + [l for _ in range (repeat_td ) for l in eng_td_lines ]
105- amr_td_lines = amr_linear_stemmed_lines + [l for _ in range (repeat_td ) for l in amr_td_lines ]
106- assert len (eng_td_lines ) == len (amr_td_lines )
99+ data . eng_preproc_lines += [l for _ in range (repeat_td ) for l in eng_td_lines ]
100+ data . amr_preproc_lines += [l for _ in range (repeat_td ) for l in amr_td_lines ]
101+ assert len (data . eng_preproc_lines ) == len (data . amr_preproc_lines )
107102
108- return eng_td_lines , amr_td_lines
103+ return data
0 commit comments