Move flash_attn assert from __init__ into calling func
#32 by rogerxfeng8 - opened

modeling_phi3_small.py  CHANGED  (+2 -1)
@@ -215,7 +215,6 @@ class Phi3SmallSelfAttention(nn.Module):
                     f"Layer {layer_idx + 1} is using dense attention since it is divisible by "
                     f"{self.config.dense_attention_every_n_layers}"
                 )
-                assert is_flash_attention_available, "Flash Attention is not available, but is needed for dense attention"
             else:
                 # BlockSparse related Parameters
                 self.blocksparse_params = BlockSparseParams.from_config(config)
@@ -419,6 +418,8 @@ class Phi3SmallSelfAttention(nn.Module):
         avoid doing that.

         """
+        assert is_flash_attention_available, "Flash Attention is not available, but is needed for dense attention"
+
         attention_dropout_prob = self.attention_dropout_rate if self.training else 0.0
         # Get into the correct shape for the Flash Attention API
         # shape: (bs, seq_len, nqp, hn)
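For context, a minimal sketch of the pattern this change applies, not the actual modeling_phi3_small.py code: the availability assert is deferred from construction time to the call site, so the module can be instantiated (for example on a machine without flash-attn installed) and only fails if the dense-attention path is actually executed. The `flash_attn` import guard and the `DenseAttentionSketch` class below are illustrative assumptions; only `is_flash_attention_available` and the assert message come from the diff itself.

```python
# Sketch of "check the optional dependency when it is used, not when the
# module is built". Names other than is_flash_attention_available are
# hypothetical stand-ins for the real module's symbols.

try:
    import flash_attn  # optional dependency; assumed package name
    is_flash_attention_available = True
except ImportError:
    is_flash_attention_available = False


class DenseAttentionSketch:
    def __init__(self, layer_idx: int):
        # No assert here: constructing the layer no longer requires
        # flash-attn, so the model can be loaded or inspected without it.
        self.layer_idx = layer_idx

    def forward(self, hidden_states):
        # The assert now lives in the calling function and only fires when
        # dense attention is actually run.
        assert is_flash_attention_available, (
            "Flash Attention is not available, but is needed for dense attention"
        )
        ...  # flash-attn based attention would run here
```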