81 changes: 80 additions & 1 deletion examples/llm_sparsity/attention_sparsity/README.md
@@ -1,6 +1,9 @@
# Attention Sparsity for HuggingFace Models

In this tutorial, we demonstrate how to use NVIDIA Model Optimizer to apply attention sparsity to HuggingFace models. Attention sparsity reduces computational cost by skipping near-zero attention scores during the softmax computation.
In this tutorial, we demonstrate how to use NVIDIA Model Optimizer to apply attention sparsity to HuggingFace models. Two methods are supported:

- **Skip-Softmax**: Threshold-based skipping of near-zero attention scores during softmax (requires `attn_implementation="eager"`)
- **Sparse24 Triton**: Fine-grained 2:4 sparsity on attention scores via a fused Triton kernel with autograd support (uses `attn_implementation="modelopt_triton"`)

## Getting Started

@@ -159,6 +162,82 @@ custom_config = {
model = mtsa.sparsify(model, config=custom_config)
```

## Fine-grained 2:4 Sparse Attention

In addition to skip-softmax, Model Optimizer supports **fine-grained 2:4 sparsity** on attention scores via a fused Triton kernel. For every 4 attention scores along the key dimension, the kernel keeps only the top 2 and zeros out the rest — achieving 50% fixed sparsity with no calibration needed.
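The top-2-of-4 selection can be sketched in plain PyTorch. This is illustrative only: the actual implementation fuses the selection into the Triton attention kernel, and `sparse24_mask` is a hypothetical helper name.

```python
import torch


def sparse24_mask(scores: torch.Tensor) -> torch.Tensor:
    """Keep the top 2 of every group of 4 scores along the last (key) dim.

    Returns a boolean mask with exactly 50% of entries set to True.
    """
    *lead, n = scores.shape
    assert n % 4 == 0, "key dimension must be a multiple of 4"
    groups = scores.reshape(*lead, n // 4, 4)
    # Indices of the top-2 values within each group of 4.
    top2 = groups.topk(2, dim=-1).indices
    mask = torch.zeros_like(groups, dtype=torch.bool).scatter_(-1, top2, True)
    return mask.reshape(*lead, n)


scores = torch.tensor([[0.9, 0.1, 0.5, 0.2, 0.3, 0.8, 0.7, 0.0]])
print(sparse24_mask(scores).float().mean().item())  # 0.5 -> fixed 50% sparsity
```

Because the selection is purely local (top 2 within each group of 4), no calibration data is needed, which is what makes the fixed 50% sparsity ratio possible.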

### Quick Example

```python
import torch
from transformers import AutoModelForCausalLM

import modelopt.torch.sparsity.attention_sparsity as mtsa
from modelopt.torch.sparsity.attention_sparsity.config import SPARSE24_TRITON

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-3.1-8B",
    torch_dtype=torch.bfloat16,
)

model = mtsa.sparsify(model, config=SPARSE24_TRITON)
```

> [!NOTE]
> Unlike skip-softmax, sparse24 does **not** require `attn_implementation="eager"`. The `mtsa.sparsify` call automatically registers the Triton kernel as `attn_implementation="modelopt_triton"`.

### Running via Command Line

```bash
python hf_sa.py \
    --pyt_ckpt_path meta-llama/Llama-3.1-8B \
    --sparse_attn sparse24_triton \
    --backend triton
```

> **Copilot AI (on lines +191 to +192):** The README instructs running hf_sa.py with `--backend triton`, but the script currently does not use `args.backend` at all (backend is implicitly determined by `--sparse_attn` via the selected config). This makes the documented CLI invocation misleading unless the script is updated to honor `--backend`. Suggested change: drop the `--backend triton` line and end the invocation at `--sparse_attn sparse24_triton`.

### Key Differences from Skip-Softmax

| | Skip-Softmax | Sparse24 Triton |
|---|---|---|
| Method | Threshold-based softmax skipping | 2:4 structured sparsity on attention scores |
| Attention backend | `eager` (patches `F.softmax`) | `modelopt_triton` (fused Triton kernel) |
| Calibration | Optional (RULER-based) | Not needed (fixed top-2-of-4 selection) |
| Sparsity ratio | Variable (depends on threshold) | Fixed 50% |
| Diagonal preservation | N/A | Yes (tiles near the causal diagonal are kept dense) |
| Training support | No | Yes (autograd-compatible forward/backward) |
| Decode support | Yes | Yes (same kernel, `is_causal=False`) |

### Training with Sparse24 Attention

The Triton kernel supports autograd. When `requires_grad=True`, the HF integration automatically uses the backward-capable path:

```python
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.1-8B", torch_dtype=torch.bfloat16)
model = mtsa.sparsify(model, config=SPARSE24_TRITON)
model.train()

# Gradients flow through the sparse attention
output = model(input_ids=ids, labels=labels)
output.loss.backward() # dQ, dK, dV computed via Triton backward kernels
```

### Custom Sparse24 Configuration

```python
custom_config = {
"sparse_cfg": {
"*attn*": {
"method": "sparse24_triton",
"backend": "triton",
"skip_diagonal_blocks": True, # Keep diagonal tiles dense (recommended)
"enable": True,
},
"default": {"enable": False},
},
}

model = mtsa.sparsify(model, config=custom_config)
```

Set `skip_diagonal_blocks: False` to apply 2:4 sparsity to all tiles including the diagonal (more aggressive but may hurt quality for local attention patterns).
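The tile-level policy behind `skip_diagonal_blocks` can be illustrated with a toy mask. The function name and the flat tile grid below are assumptions for illustration; the real kernel applies this decision per Triton tile inside the fused attention computation.

```python
import torch


def diagonal_dense_tiles(n_q_tiles: int, n_k_tiles: int) -> torch.Tensor:
    """Hypothetical sketch of skip_diagonal_blocks=True.

    True marks tiles kept fully dense: the tiles on the causal diagonal,
    where query positions attend to their own key block. All other tiles
    get 2:4 sparsity applied.
    """
    q = torch.arange(n_q_tiles).unsqueeze(1)
    k = torch.arange(n_k_tiles).unsqueeze(0)
    return q == k


# On a 4x4 tile grid, only the 4 diagonal tiles remain dense.
print(diagonal_dense_tiles(4, 4).int().sum().item())  # 4
```

Local attention patterns concentrate mass near the diagonal, which is why keeping those tiles dense (the default) tends to preserve quality.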

## References

- [Model Optimizer Documentation](https://nvidia.github.io/Model-Optimizer/)
16 changes: 10 additions & 6 deletions examples/llm_sparsity/attention_sparsity/hf_sa.py
> **Collaborator:** Do we need to update anything in example readme or changelog?

@@ -31,6 +31,7 @@
from modelopt.torch.sparsity.attention_sparsity.config import (
    SKIP_SOFTMAX_CALIB,
    SKIP_SOFTMAX_DEFAULT,
    SPARSE24_TRITON,
)
from modelopt.torch.utils.memory_monitor import launch_memory_monitor

@@ -43,6 +44,7 @@
SPARSE_ATTN_CFG_CHOICES = {
    "skip_softmax": SKIP_SOFTMAX_DEFAULT,
    "skip_softmax_calib": SKIP_SOFTMAX_CALIB,
    "sparse24_triton": SPARSE24_TRITON,
}


@@ -144,12 +146,14 @@ def main(args):

    print(f"Loading model: {args.pyt_ckpt_path}")

-    # Load model and tokenizer
-    # Note: attn_implementation="eager" is required for calibration to work properly
-    # (flash_attention_2 or sdpa would bypass the softmax patching needed for stats collection)
+    # Select attn_implementation based on sparse method:
+    # - skip_softmax methods require "eager" (softmax patching bypassed by flash/sdpa)
+    # - sparse24_triton requires "modelopt_triton" (fused Triton kernel)
+    # No need to specify attn_implementation here — mtsa.sparsify() handles it
+    # automatically based on the sparse config (sets "modelopt_triton" for triton
+    # backend, keeps "eager" for pytorch backend).
    model = AutoModelForCausalLM.from_pretrained(
        args.pyt_ckpt_path,
-        attn_implementation="eager",
        torch_dtype=torch.bfloat16,
    )
> **Contributor review (on lines +149 to 158), ⚠️ Potential issue | 🟡 Minor:** Before/after comparison uses different attention backends for flash_skip_softmax. Before `sparsify()` the model runs with whatever `attn_implementation` was selected at load time (likely `"sdpa"`); after `sparsify()`, `validate_eager_attention` forces `"eager"`. Any output difference now conflates sparsity effects with the SDPA → eager backend switch. For the sparse24_triton path this is less of a concern, but the skip_softmax path should still load with a consistent backend for a meaningful comparison. Consider documenting this limitation in the comment block at lines 149-154, or conditionally setting `attn_implementation="eager"` when the config uses a pytorch backend.

    tokenizer = AutoTokenizer.from_pretrained(args.pyt_ckpt_path)
@@ -246,8 +250,8 @@ def main(args):
        "--backend",
        type=str,
        default="pytorch",
-        choices=["pytorch"],
-        help="Backend for sparse attention (default: pytorch). More backends coming soon.",
+        choices=["pytorch", "triton"],
+        help="Backend for sparse attention (default: pytorch). Use 'triton' with sparse24_triton.",
    )

# Sequence length arguments
37 changes: 31 additions & 6 deletions modelopt/torch/sparsity/attention_sparsity/config.py
@@ -72,8 +72,8 @@ class SparseAttentionAttributeConfig(ModeloptBaseConfig):
        title="Backend implementation.",
        description=(
            "Backend to use for sparse attention computation. "
-            "Only 'pytorch' is supported, which uses softmax patching with F.softmax. "
-            "Requires model to be loaded with attn_implementation='eager'."
+            "'pytorch' uses softmax patching with F.softmax (requires attn_implementation='eager'). "
+            "'triton' uses the fused Triton kernel (requires attn_implementation='modelopt_triton')."
        ),
    )

@@ -89,10 +89,20 @@
        description=(
            "Whether the model uses causal (autoregressive) attention. "
            "If True, sparsity statistics are calculated over the lower triangle only. "
+            "Set to False for cross-attention models. "
            "Defaults to True for decoder-only models like GPT, LLaMA, etc."
        ),
    )

+    skip_diagonal_blocks: bool = ModeloptField(
+        default=True,
+        title="Skip diagonal blocks.",
+        description=(
+            "When True, keep diagonal tiles dense for 2:4 sparse attention. "
+            "Only used by sparse24_triton method. Defaults to True."
+        ),
+    )

    @field_validator("method")
    @classmethod
    def validate_method(cls, v):
@@ -104,11 +114,12 @@ def validate_method(cls, v):
    @field_validator("backend")
    @classmethod
    def validate_backend(cls, v):
-        """Validate backend is pytorch."""
-        if v != "pytorch":
+        """Validate backend is pytorch or triton."""
+        if v not in ("pytorch", "triton"):
            raise ValueError(
-                f"Invalid backend: {v}. Only 'pytorch' backend is supported. "
-                f"Model must be loaded with attn_implementation='eager'."
+                f"Invalid backend: {v}. Supported backends: 'pytorch' (requires "
+                f"attn_implementation='eager'), 'triton' (requires "
+                f"attn_implementation='modelopt_triton')."
            )
        return v

@@ -416,10 +427,24 @@ class FlashSkipSoftmaxConfig(SparseAttentionConfig):
    },
}

# 2:4 structured sparsity via Triton prefill kernel (prefill-only)

> **Contributor review, ⚠️ Potential issue | 🟡 Minor:** The comment says "prefill-only" but the kernel supports both prefill and decode. The PR description explicitly states the unified Triton kernel supports both prefill (2D kernel) and decode (3D kernel) paths with paged KV cache, so the comment should read something like `# 2:4 structured sparsity via Triton unified attention kernel (prefill + decode)`.

SPARSE24_TRITON = {
    "sparse_cfg": {
        "*attn*": {
            "method": "sparse24_triton",
            "backend": "triton",
            "skip_diagonal_blocks": True,
            "enable": True,
        },
        "default": {"enable": False},
    },
}


__all__ = [
    "SKIP_SOFTMAX_CALIB",
    "SKIP_SOFTMAX_DEFAULT",
    "SPARSE24_TRITON",
    "CalibrationConfig",
    "FlashSkipSoftmaxConfig",
    "SparseAttentionAttributeConfig",
34 changes: 34 additions & 0 deletions modelopt/torch/sparsity/attention_sparsity/conversion.py
@@ -32,6 +32,37 @@
from .utils import get_named_sparse_attention_modules, get_sparse_attention_modules


def _register_triton_backend_if_needed(model: nn.Module, config: SparseAttentionConfig) -> None:
    """Register the Triton attention backend and set attn_implementation if needed.

    When the config uses ``backend="triton"``, this function:
    1. Registers the Triton kernel with HF's ``ALL_ATTENTION_FUNCTIONS``.
    2. Sets ``model.config._attn_implementation = "modelopt_triton"`` so the
       model dispatches to the Triton kernel at forward time.

    This is called automatically during ``mtsa.sparsify()`` so users never need
    to manually call ``register_triton_attention()`` or set ``attn_implementation``.
    """
    sparse_cfg = config.sparse_cfg if hasattr(config, "sparse_cfg") else {}
    needs_triton = any(
        isinstance(v, dict) and v.get("backend") == "triton" for v in sparse_cfg.values()
    )
    if not needs_triton:
        return

    from .kernels import register_triton_attention

    if register_triton_attention is not None:
        register_triton_attention()

    # Set attn_implementation on the model so HF dispatches to the Triton kernel.
    # HF's ALL_ATTENTION_FUNCTIONS is checked at forward time, not construction time,
    # so this works even after the model is already loaded.
    model_config = getattr(model, "config", None)
    if model_config is not None:
        model_config._attn_implementation = "modelopt_triton"


def is_attn_sparsified(model: nn.Module) -> bool:
    """Check if a model has sparse attention applied.

@@ -61,6 +92,9 @@ def convert_to_sparse_attention_model(
    # Initialize the true module if necessary
    model = model.init_modellike() if isinstance(model, ModelLikeModule) else model

    # Register Triton attention backend and set attn_implementation if needed
    _register_triton_backend_if_needed(model, config)

    # Apply custom model plugins
    register_custom_model_plugins_on_the_fly(model)

56 changes: 56 additions & 0 deletions modelopt/torch/sparsity/attention_sparsity/kernels/__init__.py
@@ -0,0 +1,56 @@
# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Triton attention kernels for sparse attention optimization."""

import torch

from modelopt.torch.utils import import_plugin

IS_AVAILABLE = False
context_attention_fwd = None
context_attention = None
register_triton_attention = None
set_sparse24 = None

if torch.cuda.is_available():
    with import_plugin(
        "triton",
        msg_if_missing=(
            "Your device is potentially capable of using the triton attention "
            "kernel. Try to install triton with `pip install triton`."
        ),
    ):
        from .triton_unified_attention import context_attention as _context_attention
        from .triton_unified_attention import context_attention_fwd as _context_attention_fwd

        context_attention_fwd = _context_attention_fwd
        context_attention = _context_attention
        IS_AVAILABLE = True

    with import_plugin("transformers"):
        from .hf_triton_attention import register_triton_attention as _register_triton_attention
        from .hf_triton_attention import set_sparse24 as _set_sparse24

        register_triton_attention = _register_triton_attention
        set_sparse24 = _set_sparse24
        _register_triton_attention()

__all__ = [
    "IS_AVAILABLE",
    "context_attention",
    "context_attention_fwd",
    "register_triton_attention",
    "set_sparse24",
]
Loading