def generate(
    prompt_tokens,
    prompt_features_lens,
    prompt_features,
    prompt_rms,
    text,
    model,
    vocoder,
    tokenizer,
    num_step=4,
    guidance_scale=3.0,
    speed=1.0,
    t_shift=0.5,
    target_rms=0.1,
    feat_scale=0.1,
    pad_frames=15,
):
    """Synthesize a waveform for ``text``, conditioned on a voice prompt.

    Args:
        prompt_tokens: Token ids of the prompt transcript.
        prompt_features_lens: Lengths of the prompt feature sequences.
        prompt_features: Acoustic features of the voice prompt.
        prompt_rms: RMS level of the prompt audio, used for volume matching.
        text: Text to synthesize.
        model: TTS model exposing ``sample(...)``.
        vocoder: Vocoder exposing ``decode(features) -> waveform``.
        tokenizer: Tokenizer exposing ``texts_to_token_ids``.
        num_step: Number of sampling steps.
        guidance_scale: Classifier-free guidance scale.
        speed: Requested speaking-rate multiplier (internally scaled by 1.3).
        t_shift: Time-shift parameter passed to the sampler.
        target_rms: Target RMS used for output volume matching.
        feat_scale: Scale the predicted features are divided by before
            vocoding (default 0.1 — presumably the feature scaling applied
            during training; TODO confirm against the training config).
        pad_frames: Number of copies of the last feature frame appended
            before vocoding so the vocoder has enough trailing context to
            finish the final sound without cutting it off (default 15,
            roughly 150 ms of buffer).

    Returns:
        Waveform tensor of shape (batch, samples), clamped to [-1, 1].
    """
    tokens = tokenizer.texts_to_token_ids([text])
    # NOTE(review): 1.3 is an empirical speed correction applied on top of
    # the caller's value — confirm against the duration predictor's
    # calibration before changing.
    speed = speed * 1.3
    with torch.inference_mode():
        (pred_features, _, _, _) = model.sample(
            tokens=tokens,
            prompt_tokens=prompt_tokens,
            prompt_features=prompt_features,
            prompt_features_lens=prompt_features_lens,
            speed=speed,
            t_shift=t_shift,
            duration="predict",
            num_step=num_step,
            guidance_scale=guidance_scale,
        )
    # (batch, frames, channels) -> (batch, channels, frames), then undo the
    # feature scaling before handing the features to the vocoder.
    pred_features = pred_features.permute(0, 2, 1) / feat_scale
    if pad_frames > 0:
        # Repeat the last frame so the vocoder can render the final sound
        # completely instead of truncating it at the sequence boundary.
        last_frame = pred_features[:, :, -1:]
        padding_frames = last_frame.repeat(1, 1, pad_frames)
        pred_features = torch.cat([pred_features, padding_frames], dim=2)
    wav = vocoder.decode(pred_features).squeeze(1).clamp(-1, 1)
    # Volume matching: if the prompt was quieter than target_rms, scale the
    # synthesized audio down to match the prompt's loudness.
    if prompt_rms < target_rms:
        wav = wav * (prompt_rms / target_rms)
    return wav
Hey, I tried reducing the speed for `.generate_speech` and increasing the `duration` of `.encode_prompt`, but that did not fix the issue when using English samples + ref_text. In the end I fixed it by changing the
`generate` function in `modeling_utils.py`, specifically by adding padding before the Vocoder so it has enough data to finish the previous sound. Anything else I should have looked at?