diff --git a/.gitignore b/.gitignore
index 1573498..61bb41f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 *DS_Store
 data/
 experiments/
-configs/
+#configs/
 ._data
 
 # Byte-compiled / optimized / DLL files
diff --git a/README.md b/README.md
index 6c3b80c..8b850c4 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ In this repository, we explore different strategies for automatic transcription
 
 ## Model architecture:
 
-![base_model](https://i.ibb.co/sm3P2Bq/Screenshot-2020-04-14-at-16-19-10.png)
+![base_model](images/base_model.png)
 
 ### Available Encoders:
 - [BERT](https://arxiv.org/abs/1810.04805)
diff --git a/configs/README.md b/configs/README.md
new file mode 100644
index 0000000..6ad0ba6
--- /dev/null
+++ b/configs/README.md
@@ -0,0 +1 @@
+Folder used to save config files for the different systems/experiments.
\ No newline at end of file
diff --git a/configs/mlm_roberta.yaml b/configs/mlm_roberta.yaml
new file mode 100644
index 0000000..72e9f84
--- /dev/null
+++ b/configs/mlm_roberta.yaml
@@ -0,0 +1,36 @@
+# MLM domain adaptation of the RoBERTa base model.
+
+monitor: perplexity # Metric we want to use to decide early stopping.
+metric_mode: min # If we want to minimize or maximize the metric.
+period: 1 # Interval (number of epochs) between checkpoints.
+
+early_stopping: True # Flag to activate EarlyStopping.
+patience: 2 # Number of epochs with no improvement after which training will be stopped.
+min_delta: 0.0 # Minimum change in the monitored quantity.
+save_top_k: 3 # The best k models according to the quantity monitored will be saved.
+min_epochs: 1 # Min number of epochs.
+max_epochs: 3 # Max number of epochs.
+
+gradient_clip_val: 1.0 # Max norm of the gradients.
+gpus: 1 # Number of GPUs to use.
+distributed_backend: dp # PyTorch Lightning distributed training option.
+batch_size: 4 # Batch size to be used.
+accumulate_grad_batches: 3 # Gradient accumulation.
+loader_workers: 3 # Number of workers to load data.
+
+optimizer: Adam # Optimizer to be used.
+learning_rate: 0.00001 # Learning rate for the language model.
+scheduler: constant # Learning rate scheduler to be used.
+
+data_type: txt
+train_path: path/to/train/documents.txt # Path to the TED talks training documents.
+dev_path: path/to/dev/documents.txt # Path to the TED talks dev documents.
+test_path: path/to/test/documents.txt # Path to the TED talks test documents.
+train_val_percent_check: 0.1
+
+model: MaskedLanguageModel # Model architecture to be used.
+nr_frozen_epochs: 0 # Number of epochs we want to keep the encoder model frozen.
+loss: cross_entropy # MLM loss function.
+
+encoder_model: RoBERTa # Encoder model to be used.
+pretrained_model: roberta.base # Pretrained model to be loaded from huggingface transformers.
\ No newline at end of file
diff --git a/configs/roberta_punkt.yaml b/configs/roberta_punkt.yaml
new file mode 100644
index 0000000..8e61e98
--- /dev/null
+++ b/configs/roberta_punkt.yaml
@@ -0,0 +1,45 @@
+# Default config for the punctuation restoration experiments.
+
+monitor: slot_error_rate # Metric we want to use to decide early stopping.
+metric_mode: min # If we want to minimize or maximize the metric.
+period: 1 # Interval (number of epochs) between checkpoints.
+
+early_stopping: True # Flag to activate EarlyStopping.
+patience: 2 # Number of epochs with no improvement after which training will be stopped.
+min_delta: 0.0 # Minimum change in the monitored quantity.
+save_top_k: 1 # The best k models according to the quantity monitored will be saved.
+min_epochs: 3 # Min number of epochs.
+max_epochs: 7 # Max number of epochs.
+
+gradient_clip_val: 1.0 # Max norm of the gradients.
+gpus: 1 # Number of GPUs to use.
+distributed_backend: dp # PyTorch Lightning distributed training option.
+batch_size: 4 # Batch size to be used.
+accumulate_grad_batches: 3 # Gradient accumulation.
+loader_workers: 3 # Number of workers to load data.
+
+optimizer: Adam # Optimizer to be used.
+class_weights: ignore # For highly unbalanced corpora we can set label weights.
+ignore_first_title: False # For capitalization we want to ignore words with tag T at the beginning of sentences.
+ignore_last_tag: False # For punctuation we want to ignore punctuation tags at the end of the sentence.
+learning_rate: 0.00003 # Learning rate for the classification head.
+encoder_learning_rate: 0.00001 # Learning rate for the encoder model.
+scheduler: constant # Learning rate scheduler to be used.
+
+data_type: csv
+train_path: data/ted/train_punkt.csv # Path to the training csv with 'text' and 'tag' columns.
+dev_path: data/ted/test_punkt.csv # Path to the validation csv with 'text' and 'tag' columns.
+test_path: data/ted/test_punkt.csv # Path to the test csv with 'text' and 'tag' columns.
+train_val_percent_check: 0.1 # Percentage of the training set used for validation (overfit control).
+
+model: TransformerTagger # Model architecture to be used.
+nr_frozen_epochs: 1 # Number of epochs we want to keep the encoder model frozen.
+loss: cross_entropy # Loss function.
+encoder_model: RoBERTa # Encoder model to be used.
+pretrained_model: roberta.base # Pretrained model to be loaded from huggingface transformers.
+tag_set: O,S,C # List of tags used in the task at hand.
+
+dropout: 0.1 # Dropout for the classification head.
+layer: mix # Encoder layers we want to use. 'mix' learns to combine all layers.
+scalar_mix_dropout: 0.1 # When layer='mix' we can apply layer dropout.
+concat_tokens: True # Flag to concatenate the embeddings corresponding to the boundaries of a gap.
\ No newline at end of file
diff --git a/images/base_model.png b/images/base_model.png
new file mode 100644
index 0000000..517951d
Binary files /dev/null and b/images/base_model.png differ
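The two new YAML files are flat key/value experiment configs. As a rough illustration of how such a file might be consumed, here is a minimal sketch; the training entry point is not part of this diff, so the `load_config` helper and the `--config` flag are assumptions, not the repository's actual CLI:

```python
# Hypothetical loader sketch -- the real entry point is not included in this diff.
import argparse

import yaml  # PyYAML


def load_config(path: str) -> argparse.Namespace:
    """Read a flat YAML config (e.g. configs/roberta_punkt.yaml) into a Namespace."""
    with open(path) as handle:
        params = yaml.safe_load(handle)
    return argparse.Namespace(**params)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Load an experiment config.")
    parser.add_argument("--config", default="configs/roberta_punkt.yaml")
    args = parser.parse_args()

    hparams = load_config(args.config)
    # e.g. hparams.monitor == "slot_error_rate", hparams.batch_size == 4
    print(hparams)
```

A `Namespace` of hyperparameters like this is a common way to feed flat configs into a PyTorch Lightning module, which fits the `gpus`/`distributed_backend`/`accumulate_grad_batches` keys these files expose.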