Here, you can find an overview about all configuration options available in biotrainer. For more details, please refer to the descriptions in config_file_options.
# General Options
protocol: residue_to_class | residues_to_class | residues_to_value | sequence_to_class | sequence_to_value
interaction: multiply | concat # Default: None
seed: 1234 # Default: 42
device: cpu | cuda | cuda:0 | cuda:1 # Default: Uses cuda if available, otherwise cpu
save_split_ids: True | False # Default: False
ignore_file_inconsistencies: True | False # Default: False
output_dir: path/to/output/directory # Default: output
bootstrapping_iterations: 55 # Default: 30, Disable: 0
sanity_check: True | False # Default: True
external_writer: tensorboard | none # Default: tensorboard, none deactivates it
# Input Files
sequence_file: path/to/sequence_file.fasta # Required for all protocols
labels_file: path/to/labels_file.fasta # Required for per-residue protocols
mask_file: path/to/mask_file.fasta # Optional for per-residue protocols
# Embeddings
embedder_name: Rostlab/prot_t5_xl_uniref50 | ElnaggarLab/ankh-large | user/your-hf-model | one_hot_encoding | your_model.onnx
use_half_precision: True | False # Default: False
embeddings_file: path/to/embeddings.h5 # Optional pre-computed embeddings file
dimension_reduction_method: umap | tsne # Default: None, only possible for per-sequence embeddings
n_reduced_components: 5 # Default: None, requires dimension_reduction_method to be set
custom_tokenizer_config: tokenizer_config.json # If no config is provided, the default T5Tokenizer is used. Only applicable if using an onnx embedder
# Model Parameters
model_choice: FNN | CNN | LogReg | LightAttention # Protocol-dependent default
optimizer_choice: adam # Default: adam
learning_rate: 1e-3 # Default: 1e-3
dropout_rate: 0.25 # Default: 0.25
loss_choice: cross_entropy_loss | mean_squared_error # Protocol-dependent default
use_class_weights: True | False # Default: False
disable_pytorch_compile: True | False # Default: True
# Training Parameters
num_epochs: 200 # Default: 200
patience: 10 # Default: 10
epsilon: 1e-3 # Default: 1e-3
batch_size: 128 # Default: 128
shuffle: True | False # Default: True
# Cross Validation
cross_validation_config:
method: hold_out | k_fold | leave_p_out
# k-fold specific options
k: 5 # Required for k-fold, k >= 2
stratified: True | False # Default: False
repeat: 3 # Default: 1
nested: True | False # Default: False
nested_k: 3 # Required for nested k-fold, nested_k >= 2
search_method: random_search | grid_search
n_max_evaluations_random: 3 # For random search
# leave-p-out specific option
p: 5 # p >= 1
# Common option
choose_by: loss | accuracy | precision | recall # Default: loss
# Special Training Modes
auto_resume: True | False # Default: False
pretrained_model: path/to/model_checkpoint.safetensors # Mutually exclusive with auto_resume
limited_sample_size: 100 # Default: -1 (all options)
# HuggingFace Dataset Integration
hf_dataset:
path: huggingface_user_name/repository_name # Required
subset: subset_name # Optional
sequence_column: sequences_column_name # Required
target_column: targets_column_name # Required
mask_column: mask_column_name # Optional