
Commit 5eec25b

port rls 2.7 fixes (#3637)
* fix mllama performance regression (#3630)
* fix coverity issues (#3635)
1 parent 9085d25 commit 5eec25b

6 files changed, +46 -32 lines changed

intel_extension_for_pytorch/transformers/models/reference/models.py

+5 -24
@@ -6349,10 +6349,9 @@ def PhiOImageEmbedding_forward(
 
     if self.img_sizes is not None:
         img_sizes = self.img_sizes
-
-    if img_embeds is not None:
-        # convert to bf16
-        img_embeds = img_embeds.to(torch.bfloat16)
+    assert img_embeds is not None
+    # convert to bf16
+    img_embeds = img_embeds.to(torch.bfloat16)
 
     if self.image_attention_mask is not None:
         image_attention_mask = self.image_attention_mask.clone()
@@ -8140,8 +8139,7 @@ def prepare_inputs_for_generation_phi3(
     **kwargs,
 ):
     if past_key_values is not None:
-        cache_length = past_length = past_key_values[0][0].shape[2]
-        max_cache_length = None
+        past_length = past_key_values[0][0].shape[2]
 
         # Keep only the unprocessed tokens:
         # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
@@ -8155,14 +8153,6 @@ def prepare_inputs_for_generation_phi3(
             input_ids = input_ids[:, past_length:]
         # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
 
-        # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
-        if (
-            max_cache_length is not None
-            and attention_mask is not None
-            and cache_length + input_ids.shape[1] > max_cache_length
-        ):
-            attention_mask = attention_mask[:, -max_cache_length:]
-
     position_ids = kwargs.get("position_ids", None)
     if attention_mask is not None and position_ids is None:
         # create position_ids on the fly for batch generation
@@ -8208,8 +8198,7 @@ def prepare_inputs_for_generation_phio(
     **kwargs,
 ):
     if past_key_values is not None:
-        cache_length = past_length = past_key_values[0][0].shape[2]
-        max_cache_length = None
+        past_length = past_key_values[0][0].shape[2]
 
         # Keep only the unprocessed tokens:
         # 1 - If the length of the attention_mask exceeds the length of input_ids, then we are in a setting where
@@ -8223,14 +8212,6 @@ def prepare_inputs_for_generation_phio(
             input_ids = input_ids[:, past_length:]
         # 3 - Otherwise (past_length >= input_ids.shape[1]), let's assume input_ids only has unprocessed tokens.
 
-        # If we are about to go beyond the maximum cache length, we need to crop the input attention mask.
-        if (
-            max_cache_length is not None
-            and attention_mask is not None
-            and cache_length + input_ids.shape[1] > max_cache_length
-        ):
-            attention_mask = attention_mask[:, -max_cache_length:]
-
     position_ids = kwargs.get("position_ids", None)
     if attention_mask is not None and position_ids is None:
         # create position_ids on the fly for batch generation
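
For context, max_cache_length is hard-coded to None in this reference path, so the cropping branch removed above could never fire; only the past_length-based trimming survives. A minimal, self-contained sketch of that surviving logic, with illustrative shapes and a hypothetical helper name (not the IPEX implementation itself):

import torch


def trim_unprocessed_tokens(input_ids, attention_mask, past_length):
    # Case 1: attention_mask is longer than input_ids, so some inputs were
    # passed purely via the cache; keep only the trailing, uncached tokens.
    if attention_mask is not None and attention_mask.shape[1] > input_ids.shape[1]:
        return input_ids[:, -(attention_mask.shape[1] - past_length):]
    # Case 2: the cache already covers a prefix of input_ids; drop that prefix.
    if past_length < input_ids.shape[1]:
        return input_ids[:, past_length:]
    # Case 3: assume input_ids holds only unprocessed tokens.
    return input_ids


input_ids = torch.arange(12).reshape(1, 12)
attention_mask = torch.ones(1, 12, dtype=torch.long)
print(trim_unprocessed_tokens(input_ids, attention_mask, past_length=10))  # tensor([[10, 11]])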

intel_extension_for_pytorch/transformers/models/reference/modules/decoder.py

+35 -3
@@ -75,9 +75,41 @@ def MllamaVisionEncoderLayer_forward(
     # Self Attention
     residual = hidden_state
     hidden_state = self.input_layernorm(hidden_state)
-    hidden_state, attn_weights = self.self_attn(
-        hidden_state, attention_mask=attention_mask
-    )
+    if output_attentions:
+        hidden_state, attn_weights = self.self_attn(
+            hidden_state, attention_mask=attention_mask
+        )
+    else:
+        query = self.self_attn.q_proj(hidden_state)
+        key = self.self_attn.k_proj(hidden_state)
+        value = self.self_attn.v_proj(hidden_state)
+
+        batch_size, q_seq_len, _ = query.shape
+        _, kv_seq_len, _ = key.shape
+
+        query = query.view(
+            batch_size, q_seq_len, self.self_attn.num_heads, self.self_attn.head_dim
+        )
+        key = key.view(
+            batch_size, kv_seq_len, self.self_attn.num_heads, self.self_attn.head_dim
+        )
+        value = value.view(
+            batch_size, kv_seq_len, self.self_attn.num_heads, self.self_attn.head_dim
+        )
+
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        attn_output = F.scaled_dot_product_attention(
+            query, key, value, attn_mask=attention_mask
+        )
+
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(batch_size, q_seq_len, -1)
+
+        hidden_state = self.self_attn.o_proj(attn_output)
+        attn_weights = None
     if self.is_gated:
         hidden_state = self.gate_attn.tanh() * hidden_state
 
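
The replaced eager call routed every forward pass through the HF attention module; the new branch builds q/k/v by hand and calls F.scaled_dot_product_attention whenever attention weights are not requested, which is the mllama performance fix from #3630. As a standalone sanity check (illustrative shapes, not mllama's real configuration), the fused kernel reproduces the eager softmax(Q K^T / sqrt(d)) V result:

import math

import torch
import torch.nn.functional as F

batch, heads, seq, head_dim = 2, 4, 16, 32
q = torch.randn(batch, heads, seq, head_dim)
k = torch.randn(batch, heads, seq, head_dim)
v = torch.randn(batch, heads, seq, head_dim)

# Eager reference: softmax(Q @ K^T / sqrt(head_dim)) @ V
scores = q @ k.transpose(-2, -1) / math.sqrt(head_dim)
eager = torch.softmax(scores, dim=-1) @ v

fused = F.scaled_dot_product_attention(q, k, v)

print(torch.allclose(eager, fused, atol=1e-5))  # True, up to numerical tolerance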

tests/cpu/hf_configs/phi4/modeling_phi4mm.py

+3 -3
@@ -360,9 +360,9 @@ def forward(
             img_sizes = self.img_sizes
 
         dtype = self.img_processor.embeddings.patch_embedding.weight.dtype
-        if img_embeds is not None:
-            # convert to bf16
-            img_embeds = img_embeds.to(dtype)
+        assert img_embeds is not None
+        # convert to bf16
+        img_embeds = img_embeds.to(dtype)
 
         if self.image_attention_mask is not None:
             image_attention_mask = self.image_attention_mask.clone()

tests/cpu/hf_configs/phi4/speech_conformer_encoder.py

+1 -0
@@ -893,6 +893,7 @@ def _bucket_relative_position(self, relative_position):
         relative_position = -torch.min(
             relative_position, torch.zeros_like(relative_position)
         )
+        num_buckets = self.num_buckets
         # now relative_position is in the range [0, inf)
 
         # half of the buckets are for exact increments in positions

tests/cpu/hf_configs/phi4/vision_siglip_navit.py

+1 -1
@@ -1894,7 +1894,7 @@ def forward(
                 text_outputs,
                 vision_outputs,
             )
-            return ((loss,) + output) if loss is not None else output
+            return output
 
         return SiglipOutput(
             loss=loss,

tests/cpu/test_paged_attention_fp8.py

+1 -1
@@ -40,7 +40,7 @@ def create_kv_caches(
             value_cache = torch.empty(size=value_cache_shape, dtype=dtype)
             value_cache.uniform_(-scale, scale)
         else:
-            value_cache = torch.zeros(size=key_cache_shape, dtype=dtype)
+            value_cache = torch.zeros(size=value_cache_shape, dtype=dtype)
         value_caches.append(value_cache)
     return key_caches, value_caches
 
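
The fixed line previously zero-filled the value cache with the key cache's shape; in paged-attention layouts those shapes differ, so the non-uniform branch produced a mis-sized tensor. A small illustration with assumed vLLM-style cache shapes (not the exact ones used by this test):

import torch

num_blocks, block_size, num_heads, head_size = 8, 16, 4, 64
x = 8  # assumed packing factor along the key cache's last dimension

key_cache_shape = (num_blocks, num_heads, head_size // x, block_size, x)
value_cache_shape = (num_blocks, num_heads, head_size, block_size)

# The corrected allocation uses the value cache's own shape.
value_cache = torch.zeros(size=value_cache_shape, dtype=torch.float16)
print(value_cache.shape)  # torch.Size([8, 4, 64, 16])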
