
Commit

base model image fix
Ricardo Rei authored and Ricardo Rei committed Apr 17, 2020
1 parent 13019fd commit f3b6d46
Showing 6 changed files with 84 additions and 2 deletions.
2 changes: 1 addition & 1 deletion .gitignore
@@ -2,7 +2,7 @@
*DS_Store
data/
experiments/
-configs/
+#configs/
._data

# Byte-compiled / optimized / DLL files
2 changes: 1 addition & 1 deletion README.md
@@ -5,7 +5,7 @@ In this repository, we explore different strategies for automatic transcription

## Model architecture:

-![base_model](https://i.ibb.co/sm3P2Bq/Screenshot-2020-04-14-at-16-19-10.png)
+![base_model](images/base_model.png)

### Available Encoders:
- [BERT](https://arxiv.org/abs/1810.04805)
1 change: 1 addition & 0 deletions configs/README.md
@@ -0,0 +1 @@
Folder used to save config files for the different systems/experiments.
36 changes: 36 additions & 0 deletions configs/mlm_roberta.yaml
@@ -0,0 +1,36 @@
# MLM Domain adaptation of the RoBERTa base model.

monitor: perplexity # Metric we want to use to decide early stopping.
metric_mode: min # Whether we want to minimize or maximize the metric.
period: 1 # Interval (number of epochs) between checkpoints.

early_stopping: True # Flag to activate EarlyStopping
patience: 2 # Number of epochs with no improvement after which training will be stopped.
min_delta: 0.0 # Minimum change in the monitored quantity.
save_top_k: 3 # The best k models according to the quantity monitored will be saved.
min_epochs: 1 # Min number of epochs.
max_epochs: 3 # Max number of epochs.

gradient_clip_val: 1.0 # Max norm of the gradients.
gpus: 1 # Number of GPUs to use
distributed_backend: dp # PyTorch Lightning Distributed training option
batch_size: 4 # Batch size to be used.
accumulate_grad_batches: 3 # Gradient Accumulation.
loader_workers: 3 # Number of workers to load data.

optimizer: Adam # Optimizer to be used.
learning_rate: 0.00001 # Learning rate for the language model.
scheduler: constant # Learning rate scheduler to be used.

data_type: txt
train_path: path/to/train/documents.txt # Path to the Ted talks training documents.
dev_path: path/to/dev/documents.txt # Path to the Ted talks dev documents.
test_path: path/to/test/documents.txt # Path to the Ted talks test documents.
train_val_percent_check: 0.1

model: MaskedLanguageModel # Model architecture to be used.
nr_frozen_epochs: 0 # Number of epochs we want to keep the encoder model frozen.
loss: cross_entropy # MLM loss function.

encoder_model: RoBERTa # Encoder model to be used
pretrained_model: roberta.base # Pretrained model to be loaded from huggingface transformers.
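
The options above map fairly directly onto PyTorch Lightning's early-stopping, checkpointing, and trainer settings. Below is a minimal, illustrative sketch of how such a config could be wired up, assuming the ~0.7.x Lightning API that was current around this commit; the loading code and variable names are assumptions, not taken from this repository:

```python
# Illustrative only: wiring the YAML fields above into PyTorch Lightning
# (assumes the ~0.7.x API; newer versions pass callbacks via `callbacks=[...]`).
import yaml
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint

with open("configs/mlm_roberta.yaml") as f:
    hparams = yaml.safe_load(f)

early_stop = EarlyStopping(
    monitor=hparams["monitor"],        # perplexity
    mode=hparams["metric_mode"],       # min
    patience=hparams["patience"],
    min_delta=hparams["min_delta"],
)
checkpoint = ModelCheckpoint(
    monitor=hparams["monitor"],
    mode=hparams["metric_mode"],
    save_top_k=hparams["save_top_k"],
    period=hparams["period"],          # checkpoint every epoch
)
trainer = Trainer(
    gpus=hparams["gpus"],
    distributed_backend=hparams["distributed_backend"],
    gradient_clip_val=hparams["gradient_clip_val"],
    accumulate_grad_batches=hparams["accumulate_grad_batches"],
    min_epochs=hparams["min_epochs"],
    max_epochs=hparams["max_epochs"],
    early_stop_callback=early_stop,
    checkpoint_callback=checkpoint,
)
```

The remaining fields (optimizer, learning_rate, batch_size, data paths, encoder choice) would be consumed by the LightningModule itself rather than by the Trainer.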
45 changes: 45 additions & 0 deletions configs/roberta_punkt.yaml
@@ -0,0 +1,45 @@
# Default configs for the Punctuation restoration experiments.

monitor: slot_error_rate # Metric we want to use to decide early stopping.
metric_mode: min # Whether we want to minimize or maximize the metric.
period: 1 # Interval (number of epochs) between checkpoints.

early_stopping: True # Flag to activate EarlyStopping
patience: 2 # Number of epochs with no improvement after which training will be stopped.
min_delta: 0.0 # Minimum change in the monitored quantity.
save_top_k: 1 # The best k models according to the quantity monitored will be saved.
min_epochs: 3 # Min number of epochs.
max_epochs: 7 # Max number of epochs.

gradient_clip_val: 1.0 # Max norm of the gradients.
gpus: 1 # Number of GPUs to use
distributed_backend: dp # PyTorch Lightning Distributed training option
batch_size: 4 # Batch size to be used.
accumulate_grad_batches: 3 # Gradient Accumulation.
loader_workers: 3 # Number of workers to load data.

optimizer: Adam
class_weights: ignore # For highly unbalanced corpora we can set label weights.
ignore_first_title: False # For capitalization we want to ignore words with tag T at the beginning of sentences.
ignore_last_tag: False # For punctuation we want to ignore punctuation tags at the end of the sentence.
learning_rate: 0.00003 # Learning rate for the classification-head.
encoder_learning_rate: 0.00001 # Learning rate for the encoder model.
scheduler: constant # Learning rate scheduler to be used.

data_type: csv
train_path: data/ted/train_punkt.csv # Path to the training csv with 'text' and 'tag' columns.
dev_path: data/ted/test_punkt.csv # Path to the validation csv with 'text' and 'tag' columns.
test_path: data/ted/test_punkt.csv # Path to the test csv with 'text' and 'tag' columns.
train_val_percent_check: 0.1 # Percentage of the training used for validation (overfit control).

model: TransformerTagger # Model architecture to be used.
nr_frozen_epochs: 1 # Number of epochs we want to keep the encoder model frozen.
loss: cross_entropy
encoder_model: RoBERTa # Encoder model to be used (only BERT available atm)
pretrained_model: roberta.base # Pretrained model to be loaded from huggingface transformers.
tag_set: O,S,C # List of tags used in the task at hand.

dropout: 0.1 # Dropout for the classification-head
layer: mix # Encoder layers we want to use. 'mix' learns to combine all layers.
scalar_mix_dropout: 0.1 # When layer='mix' we can apply layer dropout.
concat_tokens: True # Flag to concatenate the embeddings corresponding to the boundaries of a gap.
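
The `layer: mix` option above refers to a learned combination of all encoder layers. A minimal sketch of what such a scalar mix might look like is shown below; this is an assumption about the mechanism, not the repository's actual implementation, and the `ScalarMix` module name is hypothetical:

```python
import torch
import torch.nn as nn

class ScalarMix(nn.Module):
    """Softmax-weighted combination of all encoder layers (illustrative sketch)."""

    def __init__(self, num_layers: int, dropout: float = 0.1):
        super().__init__()
        self.scalars = nn.Parameter(torch.zeros(num_layers))  # one learnable weight per layer
        self.gamma = nn.Parameter(torch.ones(1))               # global scaling factor
        self.dropout = dropout

    def forward(self, layers):
        # layers: list of tensors, each of shape (batch, seq_len, hidden_size)
        scalars = self.scalars
        if self.training and self.dropout > 0:
            # scalar_mix_dropout: occasionally exclude a layer from the softmax
            drop = torch.rand_like(scalars) < self.dropout
            scalars = scalars.masked_fill(drop, float("-inf"))
        weights = torch.softmax(scalars, dim=0)
        return self.gamma * sum(w * h for w, h in zip(weights, layers))
```

With `scalar_mix_dropout: 0.1`, individual layer weights are occasionally masked out during training, which regularizes the learned mixture.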
Binary file added images/base_model.png

