diff --git a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
index 92cd9a218afb..0a3e0812a42a 100644
--- a/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
+++ b/src/transformers/models/xlm_roberta_xl/modeling_xlm_roberta_xl.py
@@ -1119,6 +1119,13 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attenti
         if attention_mask is None:
             attention_mask = input_ids.new_ones(input_shape)
 
+        # Create missing `position_ids` on the fly
+        position_ids = None
+        if model_kwargs.get("position_ids") is None:
+            position_ids = create_position_ids_from_input_ids(
+                input_ids, padding_idx=self.config.pad_token_id
+            )  # placed in kwargs for further processing (see below)
+
         # cut decoder_input_ids if past_key_values is used
         if past_key_values is not None:
             past_length = past_key_values[0][0].shape[2]
@@ -1131,8 +1138,15 @@ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attenti
                 remove_prefix_length = input_ids.shape[1] - 1
 
             input_ids = input_ids[:, remove_prefix_length:]
-
-        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
+            if position_ids is not None:
+                position_ids = position_ids[:, remove_prefix_length:]
+
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "position_ids": position_ids,
+            "past_key_values": past_key_values,
+        }
 
     def _reorder_cache(self, past_key_values, beam_idx):
         reordered_past = ()
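
For context (not part of the patch): a minimal sketch of the padding-aware position-id behavior this diff relies on. It illustrates what `create_position_ids_from_input_ids` is expected to produce — non-padding tokens get consecutive positions starting at `padding_idx + 1`, while padding tokens keep `padding_idx` — so the helper name `make_position_ids` and the pad value used in the demo are assumptions for illustration, not the library source.

```python
# Sketch (assumption): padding-aware position ids, mirroring the behavior the
# diff relies on from `create_position_ids_from_input_ids`.
import torch


def make_position_ids(input_ids: torch.Tensor, padding_idx: int) -> torch.Tensor:
    # 1 for real tokens, 0 for padding
    mask = input_ids.ne(padding_idx).int()
    # running count over real tokens only; padding positions are zeroed out
    incremental = torch.cumsum(mask, dim=1).type_as(mask) * mask
    # shift so the first real token sits at padding_idx + 1
    return incremental.long() + padding_idx


if __name__ == "__main__":
    pad = 1  # XLM-R style pad_token_id (assumed value for this demo)
    ids = torch.tensor([[0, 5, 7, 2, pad, pad]])
    print(make_position_ids(ids, pad))
    # tensor([[2, 3, 4, 5, 1, 1]]) -> real tokens get 2..5, padding stays at 1
```

At decode steps with a populated cache, the diff trims these ids the same way it trims `input_ids` (`position_ids[:, remove_prefix_length:]`), so the single token passed to the model keeps the position it had in the full sequence.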