@@ -6349,10 +6349,9 @@ def PhiOImageEmbedding_forward(
 
     if self.img_sizes is not None:
        img_sizes = self.img_sizes
-
-    if img_embeds is not None:
-        # convert to bf16
-        img_embeds = img_embeds.to(torch.bfloat16)
+    assert img_embeds is not None
+    # convert to bf16
+    img_embeds = img_embeds.to(torch.bfloat16)
 
     if self.image_attention_mask is not None:
        image_attention_mask = self.image_attention_mask.clone()
@@ -8140,8 +8139,7 @@ def prepare_inputs_for_generation_phi3(
     **kwargs,
 ):
     if past_key_values is not None:
-        cache_length = past_length = past_key_values[0][0].shape[2]
-        max_cache_length = None
+        past_length = past_key_values[0][0].shape[2]
 
         # Keep only the unprocessed tokens:
         # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
@@ -8155,14 +8153,6 @@ def prepare_inputs_for_generation_phi3(
             input_ids = input_ids[:, past_length:]
         # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
 
-        # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
-        if (
-            max_cache_length is not None
-            and attention_mask is not None
-            and cache_length + input_ids.shape[1] > max_cache_length
-        ):
-            attention_mask = attention_mask[:, -max_cache_length:]
-
     position_ids = kwargs.get("position_ids", None)
     if attention_mask is not None and position_ids is None:
         # create position_ids on the fly for batch generation
@@ -8208,8 +8198,7 @@ def prepare_inputs_for_generation_phio(
     **kwargs,
 ):
     if past_key_values is not None:
-        cache_length = past_length = past_key_values[0][0].shape[2]
-        max_cache_length = None
+        past_length = past_key_values[0][0].shape[2]
 
         # Keep only the unprocessed tokens:
         # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
@@ -8223,14 +8212,6 @@ def prepare_inputs_for_generation_phio(
             input_ids = input_ids[:, past_length:]
         # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
 
-        # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
-        if (
-            max_cache_length is not None
-            and attention_mask is not None
-            and cache_length + input_ids.shape[1] > max_cache_length
-        ):
-            attention_mask = attention_mask[:, -max_cache_length:]
-
     position_ids = kwargs.get("position_ids", None)
     if attention_mask is not None and position_ids is None:
         # create position_ids on the fly for batch generation
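
For reference, below is a minimal sketch of the token-trimming behavior that both `prepare_inputs_for_generation_phi3` and `prepare_inputs_for_generation_phio` retain after dropping the unused `cache_length` / `max_cache_length` cropping path. The helper name `trim_unprocessed_tokens` and the toy cache shapes are illustrative assumptions, not part of the patch.

```python
# Minimal, illustrative sketch (not the patched source) of the retained logic,
# assuming a legacy tuple KV cache.
import torch


def trim_unprocessed_tokens(input_ids, attention_mask, past_key_values):
    if past_key_values is None:
        return input_ids
    # Legacy tuple cache: past_key_values[layer][0] has shape
    # (batch, num_heads, past_seq_len, head_dim), so dim 2 is the cached length.
    past_length = past_key_values[0][0].shape[2]
    if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
        # 1 - inputs were passed only as cached tokens; keep the new tail
        input_ids = input_ids[:, -(attention_mask.shape[1] - past_length):]
    elif past_length < input_ids.shape[1]:
        # 2 - input_ids holds all tokens; drop the already-processed prefix
        input_ids = input_ids[:, past_length:]
    # 3 - otherwise, input_ids already contains only unprocessed tokens
    return input_ids


if __name__ == "__main__":
    ids = torch.arange(12).reshape(1, 12)                    # prompt + 1 new token
    mask = torch.ones(1, 12, dtype=torch.long)
    cache = ((torch.zeros(1, 8, 11, 64), torch.zeros(1, 8, 11, 64)),)  # 11 cached tokens
    print(trim_unprocessed_tokens(ids, mask, cache).shape)   # torch.Size([1, 1])
```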