diff --git a/.gitignore b/.gitignore
index 1573498..61bb41f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,7 +2,7 @@
 *DS_Store
 data/
 experiments/
-configs/
+#configs/
 ._data
 
 # Byte-compiled / optimized / DLL files
diff --git a/README.md b/README.md
index 6c3b80c..8b850c4 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ In this repository, we explore different strategies for automatic transcription
 
 ## Model architecture:
 
-![base_model](https://i.ibb.co/sm3P2Bq/Screenshot-2020-04-14-at-16-19-10.png)
+![base_model](images/base_model.png)
 
 ### Available Encoders:
 - [BERT](https://arxiv.org/abs/1810.04805)
diff --git a/configs/README.md b/configs/README.md
new file mode 100644
index 0000000..6ad0ba6
--- /dev/null
+++ b/configs/README.md
@@ -0,0 +1 @@
+Folder used to save config files for the different systems/experiments.
\ No newline at end of file
diff --git a/configs/mlm_roberta.yaml b/configs/mlm_roberta.yaml
new file mode 100644
index 0000000..72e9f84
--- /dev/null
+++ b/configs/mlm_roberta.yaml
@@ -0,0 +1,36 @@
+# MLM domain adaptation of the RoBERTa base model.
+
+monitor: perplexity # Metric we want to use to decide early stopping.
+metric_mode: min # If we want to minimize or maximize the metric.
+period: 1 # Interval (number of epochs) between checkpoints.
+
+early_stopping: True # Flag to activate EarlyStopping.
+patience: 2 # Number of epochs with no improvement after which training will be stopped.
+min_delta: 0.0 # Minimum change in the monitored quantity.
+save_top_k: 3 # The best k models according to the quantity monitored will be saved.
+min_epochs: 1 # Min number of epochs.
+max_epochs: 3 # Max number of epochs.
+
+gradient_clip_val: 1.0 # Max norm of the gradients.
+gpus: 1 # Number of GPUs to use.
+distributed_backend: dp # PyTorch Lightning distributed training option.
+batch_size: 4 # Batch size to be used.
+accumulate_grad_batches: 3 # Gradient accumulation.
+loader_workers: 3 # Number of workers to load data.
+
+optimizer: Adam # Optimizer to be used.
+learning_rate: 0.00001 # Learning rate for the language model.
+scheduler: constant # Learning rate scheduler to be used.
+
+data_type: txt
+train_path: path/to/train/documents.txt # Path to the TED talks training documents.
+dev_path: path/to/dev/documents.txt # Path to the TED talks dev documents.
+test_path: path/to/test/documents.txt # Path to the TED talks test documents.
+train_val_percent_check: 0.1
+
+model: MaskedLanguageModel # Model architecture to be used.
+nr_frozen_epochs: 0 # Number of epochs we want to keep the encoder model frozen.
+loss: cross_entropy # MLM loss function.
+
+encoder_model: RoBERTa # Encoder model to be used.
+pretrained_model: roberta.base # Pretrained model to be loaded from huggingface transformers.
\ No newline at end of file
diff --git a/configs/roberta_punkt.yaml b/configs/roberta_punkt.yaml
new file mode 100644
index 0000000..8e61e98
--- /dev/null
+++ b/configs/roberta_punkt.yaml
@@ -0,0 +1,45 @@
+# Default config for the punctuation restoration experiments.
+
+monitor: slot_error_rate # Metric we want to use to decide early stopping.
+metric_mode: min # If we want to minimize or maximize the metric.
+period: 1 # Interval (number of epochs) between checkpoints.
+
+early_stopping: True # Flag to activate EarlyStopping.
+patience: 2 # Number of epochs with no improvement after which training will be stopped.
+min_delta: 0.0 # Minimum change in the monitored quantity.
+save_top_k: 1 # The best k models according to the quantity monitored will be saved.
+min_epochs: 3 # Min number of epochs.
+max_epochs: 7 # Max number of epochs.
+
+gradient_clip_val: 1.0 # Max norm of the gradients.
+gpus: 1 # Number of GPUs to use.
+distributed_backend: dp # PyTorch Lightning distributed training option.
+batch_size: 4 # Batch size to be used.
+accumulate_grad_batches: 3 # Gradient accumulation.
+loader_workers: 3 # Number of workers to load data.
+
+optimizer: Adam # Optimizer to be used.
+class_weights: ignore # For highly unbalanced corpora we can set label weights.
+ignore_first_title: False # For capitalization we want to ignore words with tag T at the beginning of sentences.
+ignore_last_tag: False # For punctuation we want to ignore punctuation tags at the end of the sentence.
+learning_rate: 0.00003 # Learning rate for the classification head.
+encoder_learning_rate: 0.00001 # Learning rate for the encoder model.
+scheduler: constant # Learning rate scheduler to be used.
+
+data_type: csv
+train_path: data/ted/train_punkt.csv # Path to the training csv with 'text' and 'tag' columns.
+dev_path: data/ted/test_punkt.csv # Path to the validation csv with 'text' and 'tag' columns.
+test_path: data/ted/test_punkt.csv # Path to the test csv with 'text' and 'tag' columns.
+train_val_percent_check: 0.1 # Percentage of the training set used for validation (overfit control).
+
+model: TransformerTagger # Model architecture to be used.
+nr_frozen_epochs: 1 # Number of epochs we want to keep the encoder model frozen.
+loss: cross_entropy # Loss function.
+encoder_model: RoBERTa # Encoder model to be used.
+pretrained_model: roberta.base # Pretrained model to be loaded from huggingface transformers.
+tag_set: O,S,C # List of tags used in the task at hand.
+
+dropout: 0.1 # Dropout for the classification head.
+layer: mix # Encoder layers we want to use. 'mix' learns to combine all layers.
+scalar_mix_dropout: 0.1 # When layer='mix' we can apply layer dropout.
+concat_tokens: True # Flag to concatenate the embeddings corresponding to the boundaries of a gap.
\ No newline at end of file
diff --git a/images/base_model.png b/images/base_model.png
new file mode 100644
index 0000000..517951d
Binary files /dev/null and b/images/base_model.png differ
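The two new YAML files are flat key/value experiment configs. As a rough illustration of how such a file might be consumed, here is a minimal sketch; the training entry point is not part of this diff, so the `load_config` helper and the `--config` flag are assumptions, not the repository's actual CLI:

```python
# Hypothetical loader sketch -- the real entry point is not included in this diff.
import argparse

import yaml  # PyYAML


def load_config(path: str) -> argparse.Namespace:
    """Read a flat YAML config (e.g. configs/roberta_punkt.yaml) into a Namespace."""
    with open(path) as handle:
        params = yaml.safe_load(handle)
    return argparse.Namespace(**params)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Load an experiment config.")
    parser.add_argument("--config", default="configs/roberta_punkt.yaml")
    args = parser.parse_args()

    hparams = load_config(args.config)
    # e.g. hparams.monitor == "slot_error_rate", hparams.batch_size == 4
    print(hparams)
```

A `Namespace` of hyperparameters like this is a common way to feed flat configs into a PyTorch Lightning module, which fits the `gpus`/`distributed_backend`/`accumulate_grad_batches` keys these files expose.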