1414# limitations under the License.
1515r"""Combine multiple training corpora into a single training corpus.
1616
17+ Currently this only supports the case where all input corpora share the same
18+ configurables, except for the "modules" field.
19+
1720Usage: to combine the training corpora corpus1 and corpus2 into
1821combinedcorpus, we first structure the files as follows:
1922
2730compiler_opt/tools/combine_training_corpus.py \
2831 --root_dir=$PATH_TO_combinedcorpus
2932
30- generates combinedcorpus/module_path file. In this way corpus1 and
31- corpus2 are combined into combinedcorpus.
33+ generates the combinedcorpus/corpus_description.json file. In this way corpus1
34+ and corpus2 are combined into combinedcorpus.
3235"""
3336
37+ import json
3438import os
3539
3640from absl import app
4347
4448FLAGS = flags .FLAGS
4549
46- _FILE_NAME = 'module_paths '
50+ _FILE_NAME = 'corpus_description.json '
4751
4852
4953def main (argv ):
5054 if len (argv ) > 1 :
5155 raise app .UsageError ('Too many command-line arguments.' )
5256
5357 module_names = []
58+ output_corpus_description = {}
5459
5560 for sub_dir in tf .io .gfile .listdir (FLAGS .root_dir ):
5661 path = os .path .join (FLAGS .root_dir , sub_dir , _FILE_NAME )
@@ -62,12 +67,20 @@ def main(argv):
6267 continue
6368
6469 with tf .io .gfile .GFile (path , 'r' ) as f :
65- module_names .extend (
66- [os .path .join (sub_dir , name .rstrip ('\n ' )) for name in f ])
70+ corpus_description = json .load (f )
71+ module_names .extend ([
72+ os .path .join (sub_dir , name ) for name in corpus_description ['modules' ]
73+ ])
74+ del corpus_description ['modules' ]
75+ if len (output_corpus_description ) == 0 :
76+ output_corpus_description = corpus_description
77+ elif corpus_description != output_corpus_description :
78+ raise ValueError ('Input corpora differ more than modules.' )
79+
80+ output_corpus_description ['modules' ] = module_names
6781
6882 with tf .io .gfile .GFile (os .path .join (FLAGS .root_dir , _FILE_NAME ), 'w' ) as f :
69- for module in module_names :
70- f .write (module + '\n ' )
83+ json .dump (output_corpus_description , f , indent = 2 )
7184
7285
7386if __name__ == '__main__' :
0 commit comments