Skip to content

Latest commit

 

History

History
79 lines (67 loc) · 3.29 KB

config_file_options_overview.md

File metadata and controls

79 lines (67 loc) · 3.29 KB

Configuration File Options Overview

Here, you can find an overview of all configuration options available in biotrainer. For more details, please refer to the descriptions in config_file_options.

# General Options
protocol: residue_to_class | residues_to_class | residues_to_value | sequence_to_class | sequence_to_value
interaction: multiply | concat  # Default: None
seed: 1234  # Default: 42
device: cpu | cuda | cuda:0 | cuda:1  # Default: Uses cuda if available, otherwise cpu
save_split_ids: True | False  # Default: False
ignore_file_inconsistencies: True | False  # Default: False
output_dir: path/to/output/directory  # Default: output
bootstrapping_iterations: 55  # Default: 30, Disable: 0
sanity_check: True | False  # Default: True
external_writer: tensorboard | none  # Default: tensorboard; "none" deactivates the external writer

# Input Files
sequence_file: path/to/sequence_file.fasta  # Required for all protocols
labels_file: path/to/labels_file.fasta  # Required for per-residue protocols
mask_file: path/to/mask_file.fasta  # Optional for per-residue protocols

# Embeddings
embedder_name: Rostlab/prot_t5_xl_uniref50 | ElnaggarLab/ankh-large | user/your-hf-model | one_hot_encoding | your_model.onnx
use_half_precision: True | False  # Default: False
embeddings_file: path/to/embeddings.h5  # Optional pre-computed embeddings file
dimension_reduction_method: umap | tsne  # Default: None, only possible for per-sequence embeddings
n_reduced_components: 5  # Default: None, requires dimension_reduction_method to be set
custom_tokenizer_config: tokenizer_config.json  # If no config is provided, the default T5Tokenizer is used. Only applicable if using an onnx embedder

# Model Parameters
model_choice: FNN | CNN | LogReg | LightAttention  # Protocol-dependent default
optimizer_choice: adam  # Default: adam
learning_rate: 1e-3  # Default: 1e-3
dropout_rate: 0.25  # Default: 0.25
loss_choice: cross_entropy_loss | mean_squared_error  # Protocol-dependent default
use_class_weights: True | False  # Default: False
disable_pytorch_compile: True | False  # Default: True

# Training Parameters
num_epochs: 200  # Default: 200
patience: 10  # Default: 10
epsilon: 1e-3  # Default: 1e-3
batch_size: 128  # Default: 128
shuffle: True | False  # Default: True

# Cross Validation
cross_validation_config:
  method: hold_out | k_fold | leave_p_out
  
  # k-fold specific options
  k: 5  # Required for k-fold, k >= 2
  stratified: True | False  # Default: False
  repeat: 3  # Default: 1
  nested: True | False  # Default: False
  nested_k: 3  # Required for nested k-fold, nested_k >= 2
  search_method: random_search | grid_search
  n_max_evaluations_random: 3  # For random search
  
  # leave-p-out specific option
  p: 5  # p >= 1
  
  # Common option
  choose_by: loss | accuracy | precision | recall  # Default: loss

# Special Training Modes
auto_resume: True | False  # Default: False
pretrained_model: path/to/model_checkpoint.safetensors  # Mutually exclusive with auto_resume
limited_sample_size: 100  # Default: -1 (use all samples)

# HuggingFace Dataset Integration
hf_dataset:
  path: huggingface_user_name/repository_name  # Required
  subset: subset_name  # Optional
  sequence_column: sequences_column_name  # Required
  target_column: targets_column_name  # Required
  mask_column: mask_column_name  # Optional