9 changes: 9 additions & 0 deletions examples/llm_ptq/hf_ptq.py
@@ -51,8 +51,10 @@
 import modelopt.torch.sparsity as mts
 from modelopt.torch.export import (
     export_hf_checkpoint,
+    export_speculative_decoding,
     export_tensorrt_llm_checkpoint,
     get_model_type,
+    has_spec_opt,
     save_expert_token_count_table,
 )
 from modelopt.torch.export.model_utils import get_language_model_from_vl, is_multimodal_model
@@ -566,6 +568,13 @@ def export_quantized(

     export_path = args.export_path

+    # Early exit for speculative decoding checkpoints
+    # No tokenizer saving needed for spec ckpts
+    if has_spec_opt(full_model):
+        export_speculative_decoding(full_model, export_dir=export_path)
+        print(f"Quantized speculative decoding checkpoint exported to: {export_path}")
+        return
+
     # Check if the model is a multimodal/VLM model
     is_vlm = is_multimodal_model(full_model)

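Pulled out of the surrounding hf_ptq.py flow, the new branch works roughly as in the sketch below. This is a minimal, hedged example: the load/restore lines and path names are illustrative assumptions, while `has_spec_opt`, `export_speculative_decoding`, and the early-exit behavior come from this diff (with `export_hf_checkpoint` as the existing fallback).

```python
# Sketch of the new export branch (assumes the model at MODEL_DIR was already
# quantized, and optionally converted for speculative decoding, with ModelOpt).
from transformers import AutoModelForCausalLM

import modelopt.torch.opt as mto
from modelopt.torch.export import (
    export_hf_checkpoint,
    export_speculative_decoding,
    has_spec_opt,
)

MODEL_DIR = "/path/to/quantized_modelopt_model"  # placeholder
EXPORT_PATH = "/path/to/export"  # placeholder

# Restore any ModelOpt state (quantization / speculative decoding) saved with the model.
mto.enable_huggingface_checkpointing()
full_model = AutoModelForCausalLM.from_pretrained(MODEL_DIR, torch_dtype="auto")

if has_spec_opt(full_model):
    # Speculative-decoding checkpoints are exported directly; no tokenizer
    # saving or multimodal/VLM handling is needed (the early exit above).
    export_speculative_decoding(full_model, export_dir=EXPORT_PATH)
    print(f"Quantized speculative decoding checkpoint exported to: {EXPORT_PATH}")
else:
    # Everything else keeps going through the regular HF checkpoint export.
    export_hf_checkpoint(full_model, export_dir=EXPORT_PATH)
```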
@@ -20,7 +20,7 @@
 import torch

 import modelopt.torch.opt as mto
-from modelopt.torch.export import export_hf_checkpoint
+from modelopt.torch.export import export_speculative_decoding
 from modelopt.torch.speculative.utils import load_vlm_or_llm_with_kwargs


@@ -41,7 +41,7 @@ def parse_args():
     _, model = load_vlm_or_llm_with_kwargs(args.model_path, torch_dtype="auto")
     model.eval()
     with torch.inference_mode():
-        export_hf_checkpoint(
+        export_speculative_decoding(
             model,
             export_dir=args.export_path,
         )
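The same export entry point also backs this standalone script. With the argparse wrapper removed, its flow reduces to roughly the following sketch; the model and export paths are placeholders, and the function names are those used in this diff.

```python
import torch

from modelopt.torch.export import export_speculative_decoding
from modelopt.torch.speculative.utils import load_vlm_or_llm_with_kwargs

# Load the trained draft model; the first return value is unused here, matching the script.
_, model = load_vlm_or_llm_with_kwargs("/path/to/trained_draft_model", torch_dtype="auto")
model.eval()

with torch.inference_mode():
    export_speculative_decoding(model, export_dir="/path/to/export")
```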
149 changes: 149 additions & 0 deletions modelopt/torch/export/plugins/hf_spec_configs.py
@@ -0,0 +1,149 @@
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Template config for speculative decoding exporting."""

llama_eagle_template_config = {
"architectures": ["LlamaForCausalLMEagle3"],
"bos_token_id": None,
"eos_token_id": None,
"hidden_act": None,
"hidden_size": None,
"initializer_range": None,
"intermediate_size": None,
"max_position_embeddings": None,
"model_type": "llama",
"num_attention_heads": None,
"num_key_value_heads": None,
"num_hidden_layers": None,
"pad_token_id": None,
"rms_norm_eps": None,
"tie_word_embeddings": False,
"torch_dtype": None,
"transformers_version": None,
"use_cache": None,
"vocab_size": None,
"draft_vocab_size": None,
"rope_scaling": None,
"attention_bias": None,
"attention_dropout": None,
"head_dim": None,
"mlp_bias": None,
"pretraining_tp": None,
"rope_theta": None,
"eagle_config": {
"eagle_aux_hidden_state_layer_ids": None,
"use_aux_hidden_state": None,
"use_input_layernorm_in_first_layer": None,
"use_last_layernorm": None,
"use_mtp_layernorm": None,
"next_layer_regular": True,
"parallel_draft_step": None,
"parallel_draft_heads_num_layers": None,
},
}

kimik2_eagle_template_config = {
"architectures": ["Eagle3DeepseekV2ForCausalLM"],
"attention_bias": None,
"attention_dropout": None,
"aux_loss_alpha": None,
"bos_token_id": None,
"chunk_size_feed_forward": None,
"diversity_penalty": None,
"do_sample": None,
"early_stopping": None,
"encoder_no_repeat_ngram_size": None,
"eos_token_id": None,
"ep_size": None,
"first_k_dense_replace": None,
"forced_bos_token_id": None,
"forced_eos_token_id": None,
"hidden_act": None,
"hidden_size": None,
"id2label": None,
"initializer_range": None,
"intermediate_size": None,
"is_decoder": None,
"is_encoder_decoder": None,
"kv_lora_rank": None,
"label2id": None,
"length_penalty": None,
"max_length": None,
"max_position_embeddings": None,
"min_length": None,
"model_type": "kimi_k2",
"moe_intermediate_size": None,
"moe_layer_freq": None,
"n_group": None,
"n_routed_experts": None,
"n_shared_experts": None,
"no_repeat_ngram_size": None,
"norm_topk_prob": None,
"num_attention_heads": None,
"num_beam_groups": None,
"num_beams": None,
"num_experts_per_tok": None,
"num_hidden_layers": None,
"num_key_value_heads": None,
"num_nextn_predict_layers": None,
"num_return_sequences": None,
"output_attentions": None,
"output_hidden_states": None,
"output_scores": None,
"pad_token_id": None,
"pretraining_tp": None,
"pruned_heads": None,
"q_lora_rank": None,
"qk_nope_head_dim": None,
"qk_rope_head_dim": None,
"remove_invalid_values": None,
"repetition_penalty": None,
"return_dict": None,
"return_dict_in_generate": None,
"rms_norm_eps": None,
"rope_scaling": None,
"rope_theta": None,
"routed_scaling_factor": None,
"scoring_func": None,
"sep_token_id": None,
"seq_aux": None,
"temperature": None,
"tf_legacy_loss": None,
"tie_encoder_decoder": None,
"tie_word_embeddings": None,
"top_k": None,
"top_p": None,
"topk_group": None,
"topk_method": None,
"torch_dtype": None,
"torchscript": None,
"transformers_version": None,
"typical_p": None,
"use_bfloat16": None,
"use_cache": None,
"v_head_dim": None,
"vocab_size": None,
"eagle_config": {
"eagle_aux_hidden_state_layer_ids": None,
"use_aux_hidden_state": None,
"use_input_layernorm_in_first_layer": None,
"use_last_layernorm": None,
"use_mtp_layernorm": None,
"next_layer_regular": True,
"parallel_draft_step": None,
"parallel_draft_heads_num_layers": None,
},
}
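In both templates, `None` marks a field that is expected to be filled from the corresponding entry of the draft model's config at export time, while the non-`None` entries (e.g. `architectures`, `model_type`, `"next_layer_regular": True`) are fixed by the template. A hypothetical helper, not the exporter's actual code, illustrating that fill-in pattern:

```python
# Hypothetical illustration of how a template like `llama_eagle_template_config`
# could be populated: every key whose template value is None is looked up in the
# draft model's config, and nested dicts (e.g. "eagle_config") are filled recursively.
from typing import Any


def fill_template(template: dict[str, Any], source_config: dict[str, Any]) -> dict[str, Any]:
    filled: dict[str, Any] = {}
    for key, value in template.items():
        if isinstance(value, dict):
            # Nested section such as "eagle_config": fill from the matching sub-dict.
            filled[key] = fill_template(value, source_config.get(key, {}))
        elif value is None:
            # Placeholder: take the value from the source config (stays None if absent).
            filled[key] = source_config.get(key)
        else:
            # Hard-coded template value (e.g. "model_type", "next_layer_regular").
            filled[key] = value
    return filled


# Example (hypothetical): fill_template(llama_eagle_template_config, draft_model.config.to_dict())
```

One upside of spelling out every expected key, rather than copying the source config wholesale, is that the exported config's schema for each target architecture is pinned down in one place.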