Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 18 additions & 36 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,13 +4,6 @@ python:
- "3.6"
git:
depth: false
addons:
apt:
packages:
# Additional dependencies for im2text and speech2text
- libsox-dev
- libsox-fmt-all
- sox
before_install:
# Install CPU version of PyTorch.
- if [[ $TRAVIS_PYTHON_VERSION == 3.6 ]]; then pip install torch==1.6.0 -f https://download.pytorch.org/whl/cpu/torch_stable.html; fi
Expand Down Expand Up @@ -39,40 +32,29 @@ jobs:
- name: "Unit tests"
# Please also add tests to `test/pull_request_chk.sh`.
script:
- wget -O /tmp/im2text.tgz http://lstm.seas.harvard.edu/latex/im2text_small.tgz; tar zxf /tmp/im2text.tgz -C /tmp/; head /tmp/im2text/src-train.txt > /tmp/im2text/src-train-head.txt; head /tmp/im2text/tgt-train.txt > /tmp/im2text/tgt-train-head.txt; head /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt
- wget -O /tmp/speech.tgz http://lstm.seas.harvard.edu/latex/speech.tgz; tar zxf /tmp/speech.tgz -C /tmp/; head /tmp/speech/src-train.txt > /tmp/speech/src-train-head.txt; head /tmp/speech/tgt-train.txt > /tmp/speech/tgt-train-head.txt; head /tmp/speech/src-val.txt > /tmp/speech/src-val-head.txt; head /tmp/speech/tgt-val.txt > /tmp/speech/tgt-val-head.txt
- wget -O /tmp/test_model_speech.pt http://lstm.seas.harvard.edu/latex/model_step_2760.pt
- wget -O /tmp/test_model_im2text.pt http://lstm.seas.harvard.edu/latex/test_model_im2text.pt
- python -m unittest discover
# test nmt preprocessing
- python preprocess.py -train_src data/src-train.txt -train_tgt data/tgt-train.txt -valid_src data/src-val.txt -valid_tgt data/tgt-val.txt -save_data /tmp/data -src_vocab_size 1000 -tgt_vocab_size 1000 && rm -rf /tmp/data*.pt
# test im2text preprocessing
- python preprocess.py -data_type img -shard_size 100 -src_dir /tmp/im2text/images -train_src /tmp/im2text/src-train.txt -train_tgt /tmp/im2text/tgt-train.txt -valid_src /tmp/im2text/src-val.txt -valid_tgt /tmp/im2text/tgt-val.txt -save_data /tmp/im2text/data && rm -rf /tmp/im2text/data*.pt
# test speech2text preprocessing
- python preprocess.py -data_type audio -shard_size 300 -src_dir /tmp/speech/an4_dataset -train_src /tmp/speech/src-train.txt -train_tgt /tmp/speech/tgt-train.txt -valid_src /tmp/speech/src-val.txt -valid_tgt /tmp/speech/tgt-val.txt -save_data /tmp/speech/data && rm -rf /tmp/speech/data*.pt
# test build vocabulary
- PYTHONPATH=$PYTHONPATH:. python onmt/bin/build_vocab.py -config data/data.yaml -save_data /tmp/onmt -n_sample 5000 && rm -rf /tmp/sample

# test nmt training
- python train.py -config data/data.yaml -save_data /tmp/onmt.train.check -src_vocab /tmp/onmt.vocab.src -tgt_vocab /tmp/onmt.vocab.tgt -src_vocab_size 1000 -tgt_vocab_size 1000 -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 10
# test nmt training w/copy
- python train.py -config data/data.yaml -save_data /tmp/onmt.train.check_copy -src_vocab /tmp/onmt.vocab.src -tgt_vocab /tmp/onmt.vocab.tgt -src_vocab_size 1000 -tgt_vocab_size 1000 -rnn_size 2 -batch_size 10 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 10 -dynamic_dict -copy_attn && rm -rf /tmp/onmt.train.check_copy*.pt
# test nmt training w/align
- python train.py -config data/align_data.yaml -save_data /tmp/onmt.train.check_align -src_vocab /tmp/onmt.vocab.src -tgt_vocab /tmp/onmt.vocab.tgt -src_vocab_size 1000 -tgt_vocab_size 1000 -max_generator_batches 0 -encoder_type transformer -decoder_type transformer -layers 4 -word_vec_size 16 -rnn_size 16 -heads 2 -transformer_ff 64 -lambda_align 0.05 -alignment_layer 2 -alignment_heads 0 -report_every 5 -train_steps 10 && rm -rf /tmp/onmt.train.check_align*.pt
# test Graph neural network training
- python train.py -config data/ggnn_data.yaml -save_data /tmp/onmt.train.check_ggnn -src_seq_length 1000 -tgt_seq_length 30 -dynamic_dict -encoder_type ggnn -layers 2 -decoder_type rnn -rnn_size 256 -learning_rate 0.1 -learning_rate_decay 0.8 -global_attention general -batch_size 32 -word_vec_size 256 -bridge -train_steps 10 -n_edge_types 9 -state_dim 256 -n_steps 10 -n_node 64 && rm -rf /tmp/onmt.train.check_ggnn*.pt

# test nmt translation
- head data/src-test.txt > /tmp/src-test.txt; python translate.py -model onmt/tests/test_model.pt -src /tmp/src-test.txt -verbose
# test nmt ensemble translation
- head data/src-test.txt > /tmp/src-test.txt; python translate.py -model onmt/tests/test_model.pt onmt/tests/test_model.pt -src /tmp/src-test.txt -verbose
# test im2text translation
- head /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt; python translate.py -data_type img -src_dir /tmp/im2text/images -model /tmp/test_model_im2text.pt -src /tmp/im2text/src-val-head.txt -tgt /tmp/im2text/tgt-val-head.txt -verbose -out /tmp/im2text/trans
# test speech2text translation
- head /tmp/speech/src-val.txt > /tmp/speech/src-val-head.txt; head /tmp/speech/tgt-val.txt > /tmp/speech/tgt-val-head.txt; python translate.py -data_type audio -src_dir /tmp/speech/an4_dataset -model /tmp/test_model_speech.pt -src /tmp/speech/src-val-head.txt -tgt /tmp/speech/tgt-val-head.txt -verbose -out /tmp/speech/trans; diff /tmp/speech/tgt-val-head.txt /tmp/speech/trans
# test nmt preprocessing and training
- head -500 data/src-val.txt > /tmp/src-val.txt; head -500 data/tgt-val.txt > /tmp/tgt-val.txt; python preprocess.py -train_src /tmp/src-val.txt -train_tgt /tmp/tgt-val.txt -valid_src /tmp/src-val.txt -valid_tgt /tmp/tgt-val.txt -save_data /tmp/q -src_vocab_size 1000 -tgt_vocab_size 1000; python train.py -data /tmp/q -rnn_size 2 -batch_size 2 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 10 && rm -rf /tmp/q*.pt
# test nmt preprocessing w/ sharding and training w/copy
- head -50 data/src-val.txt > /tmp/src-val.txt; head -50 data/tgt-val.txt > /tmp/tgt-val.txt; python preprocess.py -train_src /tmp/src-val.txt -train_tgt /tmp/tgt-val.txt -valid_src /tmp/src-val.txt -valid_tgt /tmp/tgt-val.txt -shard_size 25 -dynamic_dict -save_data /tmp/q -src_vocab_size 1000 -tgt_vocab_size 1000; python train.py -data /tmp/q -rnn_size 2 -batch_size 2 -word_vec_size 5 -report_every 5 -rnn_size 10 -copy_attn -train_steps 10 -pool_factor 10 && rm -rf /tmp/q*.pt

# test Graph neural network preprocessing and training
- cp data/ggnnsrc.txt /tmp/src-val.txt; cp data/ggnntgt.txt /tmp/tgt-val.txt; python preprocess.py -train_src /tmp/src-val.txt -train_tgt /tmp/tgt-val.txt -valid_src /tmp/src-val.txt -valid_tgt /tmp/tgt-val.txt -src_seq_length 1000 -tgt_seq_length 30 -src_vocab data/ggnnsrcvocab.txt -tgt_vocab data/ggnntgtvocab.txt -dynamic_dict -save_data /tmp/q ; python train.py -data /tmp/q -encoder_type ggnn -layers 2 -decoder_type rnn -rnn_size 256 -learning_rate 0.1 -learning_rate_decay 0.8 -global_attention general -batch_size 32 -word_vec_size 256 -bridge -train_steps 10 -src_vocab data/ggnnsrcvocab.txt -n_edge_types 9 -state_dim 256 -n_steps 10 -n_node 64 && rm -rf /tmp/q*.pt

# test im2text preprocessing and training
- head -50 /tmp/im2text/src-val.txt > /tmp/im2text/src-val-head.txt; head -50 /tmp/im2text/tgt-val.txt > /tmp/im2text/tgt-val-head.txt; python preprocess.py -data_type img -src_dir /tmp/im2text/images -train_src /tmp/im2text/src-val-head.txt -train_tgt /tmp/im2text/tgt-val-head.txt -valid_src /tmp/im2text/src-val-head.txt -valid_tgt /tmp/im2text/tgt-val-head.txt -save_data /tmp/im2text/q -tgt_seq_length 100; python train.py -model_type img -data /tmp/im2text/q -rnn_size 2 -batch_size 2 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 10 -pool_factor 10 && rm -rf /tmp/im2text/q*.pt
# test speech2text preprocessing and training
- head -100 /tmp/speech/src-val.txt > /tmp/speech/src-val-head.txt; head -100 /tmp/speech/tgt-val.txt > /tmp/speech/tgt-val-head.txt; python preprocess.py -data_type audio -src_dir /tmp/speech/an4_dataset -train_src /tmp/speech/src-val-head.txt -train_tgt /tmp/speech/tgt-val-head.txt -valid_src /tmp/speech/src-val-head.txt -valid_tgt /tmp/speech/tgt-val-head.txt -save_data /tmp/speech/q; python train.py -model_type audio -data /tmp/speech/q -rnn_size 2 -batch_size 2 -word_vec_size 5 -report_every 5 -rnn_size 10 -train_steps 10 -pool_factor 10 && rm -rf /tmp/speech/q*.pt
# test nmt translation
- python translate.py -model onmt/tests/test_model2.pt -src data/morph/src.valid -verbose -batch_size 10 -beam_size 10 -tgt data/morph/tgt.valid -out /tmp/trans; diff data/morph/tgt.valid /tmp/trans
# test nmt translation with beam search
- python translate.py -model onmt/tests/test_model2.pt -src data/morph/src.valid -verbose -batch_size 10 -beam_size 10 -tgt data/morph/tgt.valid -out /tmp/trans; diff data/morph/tgt.valid /tmp/trans && rm /tmp/trans
# test nmt translation with random sampling
- python translate.py -model onmt/tests/test_model2.pt -src data/morph/src.valid -verbose -batch_size 10 -beam_size 1 -seed 1 -random_sampling_topk "-1" -random_sampling_temp 0.0001 -tgt data/morph/tgt.valid -out /tmp/trans; diff data/morph/tgt.valid /tmp/trans
- python translate.py -model onmt/tests/test_model2.pt -src data/morph/src.valid -verbose -batch_size 10 -beam_size 1 -seed 1 -random_sampling_topk "-1" -random_sampling_temp 0.0001 -tgt data/morph/tgt.valid -out /tmp/trans; diff data/morph/tgt.valid /tmp/trans && rm /tmp/trans

# test tool
- PYTHONPATH=$PYTHONPATH:. python tools/extract_vocabulary.py -file /tmp/onmt.train.check.vocab.pt -file_type field -side src -out_file /tmp/onmt.vocab.txt; if ! wc -l /tmp/onmt.vocab.txt | grep -qF "1002"; then echo "wrong word count" && exit 1; else echo "creat vocabulary pass"; fi
- PYTHONPATH=$PYTHONPATH:. python tools/embeddings_to_torch.py -emb_file_enc onmt/tests/sample_glove.txt -emb_file_dec onmt/tests/sample_glove.txt -dict_file /tmp/onmt.train.check.vocab.pt -output_file /tmp/q_gloveembeddings && rm /tmp/q_gloveembeddings; rm /tmp/onmt.train.check.vocab.pt
- PYTHONPATH=$PYTHONPATH:. python tools/extract_embeddings.py -model onmt/tests/test_model.pt
6 changes: 2 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ python setup.py install

Note: If you have MemoryError in the install try to use `pip` with `--no-cache-dir`.

*(Optional)* some advanced features (e.g. working audio, image or pretrained models) requires extra packages, you can install it with:
*(Optional)* some advanced features (e.g. working pretrained models) requires extra packages, you can install it with:
```bash
pip install -r requirements.opt.txt
```
Expand All @@ -59,8 +59,6 @@ Note:
- [Copy and Coverage Attention](http://opennmt.net/OpenNMT-py/options/train.html#model-attention)
- [Pretrained Embeddings](http://opennmt.net/OpenNMT-py/FAQ.html#how-do-i-use-pretrained-embeddings-e-g-glove)
- [Source word features](http://opennmt.net/OpenNMT-py/options/train.html#model-embeddings)
- [Image-to-text processing](http://opennmt.net/OpenNMT-py/im2text.html)
- [Speech-to-text processing](http://opennmt.net/OpenNMT-py/speech2text.html)
- [TensorBoard logging](http://opennmt.net/OpenNMT-py/options/train.html#logging)
- [Multi-GPU training](http://opennmt.net/OpenNMT-py/FAQ.html##do-you-support-multi-gpu)
- [Data preprocessing](http://opennmt.net/OpenNMT-py/options/preprocess.html)
Expand Down Expand Up @@ -159,7 +157,7 @@ Major contributors are:
[Paul Tardy](https://github.com/pltrdy) (Ubiqus / Lium)
[François Hernandez](https://github.com/francoishernandez) (Ubiqus)
[Jianyu Zhan](http://github.com/jianyuzhan) (Shanghai)
[Dylan Flaute](http://github.com/flauted (University of Dayton)
[Dylan Flaute](http://github.com/flauted) (University of Dayton)
and more !

OpenNMT-py belongs to the OpenNMT project along with OpenNMT-Lua and OpenNMT-tf.
Expand Down
6 changes: 6 additions & 0 deletions data/align_data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Corpus opts:
data:
corpus_align:
path_src: data/src-val.txt
path_tgt: data/tgt-val.txt
path_align: data/val.src-tgt.talp
8 changes: 8 additions & 0 deletions data/data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Corpus opts:
data:
corpus_1:
path_src: data/src-train.txt
path_tgt: data/tgt-train.txt
valid:
path_src: data/src-val.txt
path_tgt: data/tgt-val.txt
7 changes: 7 additions & 0 deletions data/ggnn_data.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Corpus opts:
src_vocab: data/ggnnsrcvocab.txt
tgt_vocab: data/ggnntgtvocab.txt
data:
ggnn:
path_src: data/ggnnsrc.txt
path_tgt: data/ggnntgt.txt
4 changes: 2 additions & 2 deletions data/morph/tgt.valid
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ o n t o ɡ e n e z a
p r i m j e n a
k r e k e t
a e r o d r o m
o s l o b a d z a t i
o s l o b a d i t i
s t u d i j
k r a t k o ʋ i d
l u p e ʃ t ʋ o
Expand Down Expand Up @@ -135,7 +135,7 @@ x a n
p r i p o ʋ e d a t ʃ
k i n o l o ɡ i j a
a s t r o n o m i j s k i
n e i z ʎ e t ʃ i ʋ o s t
n e i z l j e t ʃ i ʋ o s t
u s l o ʋ a n
s r p s k i
e ʋ o l u t s i o n i z a m
Expand Down
2 changes: 1 addition & 1 deletion data/src-train.txt
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ It is not acceptable that , with the help of the national bureaucracies , Parlia
Federal Master Trainer and Senior Instructor of the Italian Federation of Aerobic Fitness , Group Fitness , Postural Gym , Stretching and Pilates; from 2004 , he has been collaborating with Antiche Terme as personal Trainer and Instructor of Stretching , Pilates and Postural Gym .
" Two soldiers came up to me and told me that if I refuse to sleep with them , they will kill me . They beat me and ripped my clothes .
Yes , we also say that the European budget is not about the duplication of national budgets , but about delivering common goals beyond the capacity of nation states where European funds can realise economies of scale or create synergies .
The name of this site , and program name Title purchased will not be displayed .

They would be abiding by the principle of the UN , which precludes military action except in self-defence , which does not apply here .
rapporteur . - (FR) Mr President , representatives of the Council and the Commission , ladies and gentlemen , I should like to begin by thanking my colleagues , who entrusted me with this report , and the shadow rapporteur for their respective contributions .
Shortly thereafter , Mårthen Cedergran , who had been responsible for vocals , left Bombshell Rocks to establish himself as a tattoo artist .
Expand Down
Loading