Skip to content

LTX-2.3 training outputs noise, please help #165

@GuardSkill

Description

I am using the trainer to train LTX-2.3, but the very first validation sample (generated before any training has happened) is pure noise. When I use LTX-2.0 instead, it does not sample noise.

# =============================================================================
# LTX-2 Audio-Video LoRA Training Configuration
# =============================================================================
#
# This configuration is for training LoRA adapters on the LTX-2 model for
# text-to-video generation. It supports both video-only and joint audio-video
# training modes.
#
# Use this configuration when you want to:
#   - Fine-tune LTX-2 on your own video dataset
#   - Train with or without audio generation
#   - Create custom video generation styles or audiovisual concepts
#
# Dataset structure for text-to-video training:
#   preprocessed_data_root/
#   ├── latents/        # Video latents (VAE-encoded videos)
#   ├── conditions/     # Text embeddings for each video
#   └── audio_latents/  # Audio latents (only if with_audio: true)
#
# =============================================================================

# -----------------------------------------------------------------------------
# Model Configuration
# -----------------------------------------------------------------------------
# Specifies the base model to fine-tune and the training mode.
model:
  # Local filesystem path to the downloaded LTX-2 checkpoint (.safetensors).
  model_path: '/root/lisiyuan/Models/LTX-2.3-fp8/ltx-2.3-22b-dev-fp8.safetensors'

  # Directory holding the text encoder model.
  # For LTX-2 this is typically the Gemma-based text encoder.
  text_encoder_path: '/root/lisiyuan/Models/gemma-3-12b-it-qat-q4_0-unquantized'

  # How the base model is trained:
  #   "lora" - efficient adapter training (recommended for most use cases:
  #            faster, less memory, prevents overfitting)
  #   "full" - full fine-tuning
  training_mode: 'lora'

  # Optional checkpoint to resume training from; null starts a new run.
  # Accepts a checkpoint file (.safetensors) or a directory, in which case
  # the latest checkpoint is used.
  load_checkpoint: null

# -----------------------------------------------------------------------------
# LoRA Configuration
# -----------------------------------------------------------------------------
# Controls the Low-Rank Adaptation parameters for efficient fine-tuning.
lora:
  # LoRA matrix rank. A higher rank gives more capacity at the cost of more
  # parameters. Typical values are 8, 16, 32 and 64; 32 is a good starting
  # point for general fine-tuning.
  rank: 32

  # Alpha scaling factor. The effective scaling is alpha / rank, so keeping
  # alpha equal to rank (the usual choice) yields a scaling of 1.0.
  alpha: 32

  # Dropout probability applied to the LoRA layers (0.0 disables dropout).
  # Raising it can help regularize when overfitting occurs.
  dropout: 0.0

  # Transformer modules that receive LoRA adapters.
  #
  # The LTX-2 transformer keeps separate attention and feed-forward blocks
  # for video and for audio:
  #
  #   Video
  #     attn1.{to_k,to_q,to_v,to_out.0}        self-attention
  #     attn2.{to_k,to_q,to_v,to_out.0}        cross-attention to text
  #     ff.net.0.proj, ff.net.2                feed-forward
  #
  #   Audio
  #     audio_attn1.{to_k,to_q,to_v,to_out.0}  self-attention
  #     audio_attn2.{to_k,to_q,to_v,to_out.0}  cross-attention to text
  #     audio_ff.net.0.proj, audio_ff.net.2    feed-forward
  #
  #   Audio-video cross-attention (cross-modal interaction)
  #     audio_to_video_attn.{to_k,to_q,to_v,to_out.0}
  #         Q from video, K/V from audio: video attends to audio features
  #     video_to_audio_attn.{to_k,to_q,to_v,to_out.0}
  #         Q from audio, K/V from video: audio attends to video features
  #
  # A short pattern such as "to_k" matches ALL attention modules (video,
  # audio and cross-modal). This is the recommended approach for
  # audio-video training.
  target_modules:
    # Attention projections; these match both the video and audio branches.
    - 'to_k'
    - 'to_q'
    - 'to_v'
    - 'to_out.0'
    # Uncomment the entries below to train the feed-forward layers as well
    # (this can increase the LoRA's capacity):
    # - "ff.net.0.proj"
    # - "ff.net.2"
    # - "audio_ff.net.0.proj"
    # - "audio_ff.net.2"

# -----------------------------------------------------------------------------
# Training Strategy Configuration
# -----------------------------------------------------------------------------
# Defines the text-to-video training approach.
training_strategy:
  # Strategy identifier; "text_to_video" selects standard text-to-video
  # training.
  name: 'text_to_video'

  # Probability that a training sample is conditioned on its first frame.
  # Raise this value to make the model better at image-to-video (I2V), where
  # a clean first frame is supplied and the model generates the rest of the
  # video.
  first_frame_conditioning_p: 0.85

  # Joint audio-video training switch. Set to true when the dataset includes
  # audio and the audio branch should be trained too.
  with_audio: false

  # Name of the directory inside preprocessed_data_root that holds the audio
  # latents. Only read when with_audio is true.
  audio_latents_dir: 'audio_latents'

# -----------------------------------------------------------------------------
# Optimization Configuration
# -----------------------------------------------------------------------------
# Controls the training optimization parameters.
optimization:
  # Learning rate for the optimizer.
  # Typical range for LoRA: 1e-5 to 1e-4.
  # Written with an explicit decimal point on purpose: YAML 1.1 loaders
  # (e.g. PyYAML) only resolve exponent notation as a float when the mantissa
  # contains a ".", so a bare `1e-4` would be loaded as the string "1e-4".
  learning_rate: 1.0e-4

  # Total number of training steps.
  steps: 2000

  # Batch size per GPU. Reduce if running out of memory.
  batch_size: 1

  # Number of gradient accumulation steps.
  # Effective batch size = batch_size * gradient_accumulation_steps * num_gpus
  gradient_accumulation_steps: 1

  # Maximum gradient norm for clipping (helps training stability).
  max_grad_norm: 1.0

  # Optimizer type: "adamw" (standard) or "adamw8bit" (memory-efficient).
  optimizer_type: "adamw"

  # Learning rate scheduler type.
  # Options: "constant", "linear", "cosine", "cosine_with_restarts", "polynomial"
  scheduler_type: "linear"

  # Additional scheduler parameters (which keys apply depends on
  # scheduler_type). An empty mapping passes no extra parameters.
  scheduler_params: {}

  # Enable gradient checkpointing to reduce memory usage.
  # Recommended for training with limited GPU memory.
  enable_gradient_checkpointing: true

# -----------------------------------------------------------------------------
# Acceleration Configuration
# -----------------------------------------------------------------------------
# Hardware acceleration and memory optimization settings.
acceleration:
  # Mixed-precision mode used during training:
  #   "no"   - fp32
  #   "fp16" - half precision
  #   "bf16" - bfloat16 (recommended)
  mixed_precision_mode: 'bf16'

  # Quantization applied to the model to reduce memory usage.
  # Options: null (none), "int8-quanto", "int4-quanto", "int2-quanto",
  #          "fp8-quanto", "fp8uz-quanto"
  # NOTE(review): model.model_path points at a file named
  # "...-fp8.safetensors". If that checkpoint is already stored in FP8,
  # quantizing it again here may not be supported and could be related to
  # noisy samples at step 0 - TODO confirm, e.g. by trying a non-FP8
  # checkpoint or `quantization: null`.
  quantization: 'fp8-quanto'

  # Load the text encoder in 8-bit precision to save memory; useful when GPU
  # memory is limited.
  load_text_encoder_in_8bit: false

# -----------------------------------------------------------------------------
# Data Configuration
# -----------------------------------------------------------------------------
# Specifies the training data location and loading parameters.
data:
  # Root of the preprocessed training data. Expected to contain latents/ and
  # conditions/, plus audio_latents/ when audio is used.
  preprocessed_data_root: '/root/lisiyuan/LTX-2/packages/ltx-trainer/.precomputed_ltx23'

  # Number of worker processes the data loader runs in parallel to speed up
  # loading.
  num_dataloader_workers: 2

# -----------------------------------------------------------------------------
# Validation Configuration
# -----------------------------------------------------------------------------
# Controls validation video generation during training.
# NOTE: Validation sampling uses simplified inference pipelines and prioritizes speed over
# maximum quality. For production-quality inference, use `packages/ltx-pipelines`.
validation:
  # Prompts used to generate the validation videos. Choose prompts that are
  # representative of the training data. LTX-2 prefers longer, detailed
  # prompts describing both the visual content and the audio.
  prompts:
    - "PETANIMATION,A anime charater walk in a white backgroud."
    - "PETANIMATION, a cute, cartoonish mushroom character. climbing.  "

  # Negative prompt that steers sampling away from unwanted artifacts.
  negative_prompt: "worst quality, inconsistent motion, blurry, jittery, distorted"

  # Optional first-frame images for image-to-video validation.
  # When set, there must be exactly one image per prompt.
  images: null

  # Output video dimensions, in the order width, height, frames.
  #   - width and height must be divisible by 32
  #   - frames must satisfy frames % 8 == 1
  #     (e.g. 1, 9, 17, 25, 33, 41, 49, 57, 65, 73, 81, 89, ...)
  video_dims:
    - 576
    - 576
    - 57

  # Frame rate of the generated videos.
  frame_rate: 25.0

  # Random seed, so validation outputs are reproducible.
  seed: 42

  # Number of denoising steps for validation inference.
  # More steps give better quality but slower generation.
  inference_steps: 30

  # Generate validation videos every N training steps.
  # Set to null to disable validation during training.
  interval: 100

  # How many videos to generate for each prompt.
  videos_per_prompt: 1

  # Classifier-free guidance (CFG) scale. Higher values follow the prompt
  # more strongly but may introduce artifacts.
  guidance_scale: 4.0

  # STG (Spatio-Temporal Guidance) settings. STG is combined with CFG for
  # better temporal coherence and improved video quality.
  #
  # Recommended scale: 1.0 (0.0 disables STG).
  stg_scale: 1.0
  # Recommended: the single block 29.
  stg_blocks:
    - 29
  # "stg_av" perturbs both audio and video; "stg_v" perturbs video only.
  stg_mode: "stg_av"

  # Whether validation samples include generated audio. This is independent
  # of training_strategy.with_audio: audio can be generated in validation
  # even when the audio branch is not being trained.
  generate_audio: true

  # When true, the validation run at the beginning of training (step 0) is
  # skipped.
  skip_initial_validation: false

# -----------------------------------------------------------------------------
# Checkpoint Configuration
# -----------------------------------------------------------------------------
# Controls model checkpoint saving during training.
checkpoints:
  # Write a checkpoint every N steps; null disables intermediate checkpoints.
  interval: 250

  # How many of the most recent checkpoints to retain; -1 keeps all of them.
  keep_last_n: -1

  # Precision of the saved checkpoint weights:
  #   "bfloat16" - default, smaller files
  #   "float32"  - full precision
  precision: 'bfloat16'

# -----------------------------------------------------------------------------
# Flow Matching Configuration
# -----------------------------------------------------------------------------
# Parameters for the flow matching training objective.
flow_matching:
  # How training timesteps are sampled.
  # "shifted_logit_normal" is the recommended mode for LTX-2 models.
  timestep_sampling_mode: 'shifted_logit_normal'

  # Extra parameters for the timestep sampler; an empty mapping passes none.
  timestep_sampling_params: {}

# -----------------------------------------------------------------------------
# Hugging Face Hub Configuration
# -----------------------------------------------------------------------------
# Settings for uploading trained models to the Hugging Face Hub.
hub:
  # Set to true to push the trained model to the Hugging Face Hub.
  push_to_hub: false

  # Target repository ID on the Hub, e.g. "username/my-lora-model".
  # Must be provided when push_to_hub is true.
  hub_model_id: null

# -----------------------------------------------------------------------------
# Weights & Biases Configuration
# -----------------------------------------------------------------------------
# Settings for experiment tracking with W&B.
wandb:
  # Turn W&B logging on or off.
  enabled: false

  # Name of the W&B project that runs are logged to.
  project: 'ltx-2-trainer'

  # W&B username or team; null uses the default account.
  entity: null

  # Tags that help organize runs.
  tags:
    - 'ltx2'
    - 'lora'

  # Also log the generated validation videos to W&B.
  log_validation_videos: true

# -----------------------------------------------------------------------------
# General Configuration
# -----------------------------------------------------------------------------
# Global settings for the training run.

# Global random seed, for reproducible runs.
seed: 42

# Output directory for checkpoints, validation videos and logs.
output_dir: 'outputs/ltx23_av_lora'

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions