From dc84b6fc5325db1d2c68378ceaec7ba5e92b3034 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:35:04 -0800 Subject: [PATCH 01/11] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 79 +++++++++---- examples/llm_ptq/hf_ptq.py | 116 +++++++++++++----- examples/llm_ptq/vlm_utils.py | 131 +++++++++++++++------ modelopt/torch/export/model_utils.py | 12 +- modelopt/torch/export/unified_export_hf.py | 84 ++++++++----- 5 files changed, 302 insertions(+), 120 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 93687a8d0..cfd218abf 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -31,6 +31,7 @@ from safetensors.torch import load_file from transformers import ( AutoConfig, + AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer, @@ -67,27 +68,39 @@ def run_nemotron_vl_preview( """ from vlm_utils import run_text_only_generation, run_vl_preview_generation - print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...") - question = tokenizer.decode(input_ids[0], skip_special_tokens=True) - generation_config = { - "max_new_tokens": 100, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } - - # Try text-only generation - text_response = run_text_only_generation( - full_model, tokenizer, question, generation_config, pyt_ckpt_path - ) + # Check if this is Nemotron-Parse (encoder-decoder model that requires images) + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + generated_ids = None + + if not is_nemotron_parse: + # Only try text-only generation for models that support it (not Nemotron-Parse) + print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...") + question = tokenizer.decode(input_ids[0], skip_special_tokens=True) + 
generation_config = { + "max_new_tokens": 100, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } + + # Try text-only generation + text_response = run_text_only_generation( + full_model, tokenizer, question, generation_config, pyt_ckpt_path + ) - if text_response is not None: - print(f"✅ Text-only generation successful: {text_response[:100]}...") - generated_ids = text_response - elif allow_fallback: - print("Text-only generation failed, falling back to standard generate...") - generated_ids = full_model.generate(input_ids, max_new_tokens=100) + if text_response is not None: + print(f"✅ Text-only generation successful: {text_response[:100]}...") + generated_ids = text_response + elif allow_fallback: + print("Text-only generation failed, falling back to standard generate...") + generated_ids = full_model.generate(input_ids, max_new_tokens=100) else: - generated_ids = None + print( + f"Skipping text-only generation for Nemotron-Parse ({stage_name}) - " + "this encoder-decoder model requires images for all operations." 
+ ) # Run additional VL test with images print(f"Running additional VL test with images ({stage_name})...") @@ -98,6 +111,10 @@ def run_nemotron_vl_preview( def _is_multimodal_config(config): """Check if a config indicates a multimodal model (config-only version of is_multimodal_model).""" + # Check for Nemotron-Parse encoder-decoder architecture + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + return ( hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL) or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal @@ -106,6 +123,7 @@ def _is_multimodal_config(config): or ( hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer") ) # Image embedding layers + or is_nemotron_parse # Nemotron-Parse conditional generation model ) @@ -312,8 +330,19 @@ def get_processor( ) return MllamaImageProcessor(processor, device) - - return None + else: + # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) + # This will only work if the model has a processor config + try: + processor = AutoProcessor.from_pretrained( + ckpt_path, + **model_kwargs, + ) + print(f"Loaded AutoProcessor for model type: {model_type}") + return processor + except Exception as e: + print(f"Could not load processor for {model_type}: {e}") + return None def load_mtp_weights( @@ -466,8 +495,6 @@ def get_model( model_kwargs.setdefault("torch_dtype", "auto") if "vila" in ckpt_path.lower(): - from transformers import AutoModel - hf_vila = AutoModel.from_pretrained( ckpt_path, device_map=device_map, @@ -510,13 +537,13 @@ def get_model( if not hasattr(transformers, architecture): warnings.warn( f"Architecture {architecture} not found in transformers: {transformers.__version__}. " - "Falling back to AutoModelForCausalLM." + "Falling back to AutoModel." 
) assert trust_remote_code, ( "Please set trust_remote_code to True if you want to use this architecture" ) - auto_model_module = AutoModelForCausalLM + auto_model_module = AutoModel from_config = auto_model_module.from_config else: auto_model_module = getattr(transformers, architecture) @@ -527,7 +554,7 @@ def get_model( # unless specified by the hf_config. torch_dtype = getattr(hf_config, "torch_dtype", torch.bfloat16) model_kwargs2 = model_kwargs.copy() - if auto_model_module != AutoModelForCausalLM: + if auto_model_module not in [AutoModelForCausalLM, AutoModel]: model_kwargs2.pop("trust_remote_code", None) model_kwargs2["torch_dtype"] = torch_dtype model_kwargs2.pop("max_memory", None) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index d9a6ca893..b4be53ecf 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -411,6 +411,18 @@ def load_model(args: argparse.Namespace): args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[ : len(args.dataset) ] + + # Check if this is a Nemotron VL model that needs a processor + is_nemotron_vl_model = is_nemotron_vl(full_model) + if is_nemotron_vl_model: + # Load processor for Nemotron VL models (like Nemotron-Parse) + processor = get_processor( + args.pyt_ckpt_path, + model_type, + device, + trust_remote_code=args.trust_remote_code, + ) + tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code) default_padding_side = tokenizer.padding_side @@ -670,10 +682,20 @@ def pre_quantize( post-quantize generation. 
""" + # Check if this is Nemotron-Parse (encoder-decoder model) + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + # Only run single sample for preview - preview_input_ids = next(iter(calib_dataloader))[ - "input_features" if model_type == "whisper" else "input_ids" - ][0:1] + # For Nemotron-Parse, use decoder_input_ids instead of input_ids + sample_batch = next(iter(calib_dataloader)) + if is_nemotron_parse and "decoder_input_ids" in sample_batch: + preview_input_ids = sample_batch["decoder_input_ids"][0:1] + elif model_type == "whisper": + preview_input_ids = sample_batch["input_features"][0:1] + else: + preview_input_ids = sample_batch["input_ids"][0:1] # Generate preview before quantization if model_type == "deepseek": @@ -800,36 +822,46 @@ def quantize_main( device: torch.device, ): if args.batch_size == 0: - # Calibration/sparsification will actually take much more memory than regular inference - # due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio - # to 2 to avoid OOM for AWQ/SmoothQuant fake quantization as it will take more memory than inference. 
- sample_memory_usage_ratio = 2 if "awq" in args.qformat or "sq" in args.qformat else 1.1 - # Whisper model expects mel-spectrogram input features of length 3000 - # Whisper model needs input of shape (batch_size, num_mel_bins, 3000) - # As the encoder of Whisper doesn't have embedding layer, input dtype has to be float - # For non-Whisper models (language models), sample_input will be set up inside get_max_batch_size() - if model_type == "whisper": - max_sample_length = 3000 - num_mel_bins = language_model.config.num_mel_bins - sample_input_single_batch = ( - torch.ones([1, num_mel_bins, max_sample_length], dtype=language_model.dtype).to( - language_model.device - ) - * 100 + # Check if this is a vision-language model + # For VL models, skip automatic batch size detection and use a conservative default + # since proper multimodal input preparation is complex + if is_multimodal_model(full_model) or is_nemotron_vl(full_model): + print( + "Vision-language model detected. Using default batch_size=1 for calibration " + "to ensure proper handling of multimodal inputs." ) + args.batch_size = 1 else: - sample_input_single_batch = None + # Calibration/sparsification will actually take much more memory than regular inference + # due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio + # to 2 to avoid OOM for AWQ/SmoothQuant fake quantization as it will take more memory than inference. 
+ sample_memory_usage_ratio = 2 if "awq" in args.qformat or "sq" in args.qformat else 1.1 + # Whisper model expects mel-spectrogram input features of length 3000 + # Whisper model needs input of shape (batch_size, num_mel_bins, 3000) + # As the encoder of Whisper doesn't have embedding layer, input dtype has to be float + # For non-Whisper models (language models), sample_input will be set up inside get_max_batch_size() + if model_type == "whisper": + max_sample_length = 3000 + num_mel_bins = language_model.config.num_mel_bins + sample_input_single_batch = ( + torch.ones([1, num_mel_bins, max_sample_length], dtype=language_model.dtype).to( + language_model.device + ) + * 100 + ) + else: + sample_input_single_batch = None - run_auto_quant = args.auto_quantize_bits is not None + run_auto_quant = args.auto_quantize_bits is not None - args.batch_size = get_max_batch_size( - language_model, - max_sample_length=args.calib_seq, - sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0, - sample_input_single_batch=sample_input_single_batch, - enable_grad=run_auto_quant, - ) - args.batch_size = min(args.batch_size, sum(args.calib_size)) + args.batch_size = get_max_batch_size( + language_model, + max_sample_length=args.calib_seq, + sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0, + sample_input_single_batch=sample_input_single_batch, + enable_grad=run_auto_quant, + ) + args.batch_size = min(args.batch_size, sum(args.calib_size)) print(f"Use calib batch_size {args.batch_size}") @@ -840,6 +872,32 @@ def quantize_main( # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) + # For Nemotron-Parse, wrap the text-only dataloader to add dummy images + # Nemotron-Parse is an encoder-decoder model that requires pixel_values + if is_nemotron_vl_model and processor is not None: + config = full_model.config + architectures = getattr(config, "architectures", 
[]) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + if is_nemotron_parse: + # Check if we're quantizing just the decoder or the full model + decoder_only = language_model is not full_model + + if decoder_only: + print( + "Calibration will use text-only inputs for Nemotron-Parse decoder. " + "Vision encoder is excluded from quantization." + ) + else: + print( + "Wrapping calibration dataloader for Nemotron-Parse to add dummy images. " + "Nemotron-Parse requires pixel_values for full model calibration." + ) + + calib_dataloader = create_nemotron_parse_calib_wrapper( + calib_dataloader, processor, device, decoder_only=decoder_only + ) + preview_input_ids, generated_ids_before_ptq = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 6c9d921b8..4789130cd 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -18,7 +18,7 @@ import os from PIL import Image -from transformers import AutoImageProcessor, AutoProcessor +from transformers import AutoImageProcessor, AutoProcessor, GenerationConfig def run_vl_preview_generation(model, tokenizer, model_path, stage_name): @@ -73,13 +73,34 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): print(" Skipping VL preview generation.") return None + # Check if this is Nemotron-Parse early to set up proper generation config + config = model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + # Generate response question = "Describe this image briefly." 
# Updated for single image - generation_config = { - "max_new_tokens": 50, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } + + # Use model's GenerationConfig for Nemotron-Parse, dict for others + if is_nemotron_parse: + try: + generation_config = GenerationConfig.from_pretrained( + model_path, trust_remote_code=True + ) + print("Using Nemotron-Parse GenerationConfig from model") + except Exception as e: + print(f"Warning: Could not load GenerationConfig: {e}, using defaults") + generation_config = { + "max_new_tokens": 50, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } + else: + generation_config = { + "max_new_tokens": 50, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } print(f"Generating VL response ({stage_name})...") @@ -105,27 +126,39 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): else: processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - messages = [ - {"role": "system", "content": "/no_think"}, - { - "role": "user", - "content": [ - { - "type": "image", - "image": "", - }, - { - "type": "text", - "text": question, - }, - ], - }, - ] + # Check if this is Nemotron-Parse (uses task prompts instead of chat templates) + config = model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - # Apply chat template - prompt = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) + if is_nemotron_parse: + # Nemotron-Parse uses a specific task prompt format + # See: https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1#usage-example + prompt = "" + print(f"Using Nemotron-Parse task prompt: {prompt}") + else: + # Other VL models use chat templates + messages = [ + {"role": "system", "content": "/no_think"}, + { + "role": "user", + "content": [ + { + "type": "image", + "image": "", + }, + { + "type": "text", + "text": 
question, + }, + ], + }, + ] + + # Apply chat template + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Process inputs using the processor with single image inputs = processor( @@ -139,21 +172,49 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): inputs = inputs.to(model_device) print(f" Moved inputs to {model_device}") + # Verify we have pixel_values for the vision encoder + if not hasattr(inputs, 'pixel_values') or inputs.pixel_values is None: + raise ValueError("Processor did not generate pixel_values. Check processor configuration.") + # Generate response using model.generate - generated_ids = model.generate( - pixel_values=inputs.pixel_values, - input_ids=inputs.input_ids, - attention_mask=inputs.attention_mask, - **generation_config, - ) + if isinstance(generation_config, GenerationConfig): + # For Nemotron-Parse with GenerationConfig object + generated_ids = model.generate( + pixel_values=inputs.pixel_values, + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + generation_config=generation_config, + ) + else: + # For other models with dict generation config + generated_ids = model.generate( + pixel_values=inputs.pixel_values, + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + **generation_config, + ) # Decode the response (trim input tokens like in the working example) + if generated_ids is None: + raise ValueError("Model generate returned None") + generated_ids_trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] - output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False - ) + + # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode + if is_nemotron_parse and hasattr(tokenizer, 'batch_decode'): + output_text = tokenizer.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, 
clean_up_tokenization_spaces=False + ) + else: + output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + if output_text is None or len(output_text) == 0: + raise ValueError("Decoding returned empty output") + response = output_text[0] print(f"✅ VL generation {stage_name} successful!") diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 5a24429ad..40c313ad2 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -85,6 +85,7 @@ def is_multimodal_model(model): - Vision LoRA configurations - Audio processing capabilities - Image embedding layers + - Nemotron-Parse conditional generation models Args: model: The HuggingFace model instance to check @@ -103,6 +104,10 @@ def is_multimodal_model(model): """ config = model.config + # Check for Nemotron-Parse encoder-decoder architecture + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + return ( hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL) or hasattr(model, "language_model") # Language model attribute (e.g., LLaVA) @@ -112,6 +117,7 @@ def is_multimodal_model(model): or ( hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer") ) # Image embedding layers + or is_nemotron_parse # Nemotron-Parse conditional generation model ) @@ -141,5 +147,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None: if hasattr(model, "language_model"): return [model, model.language_model] - # Pattern 3: No language_model found + # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model + if hasattr(model, "decoder"): + return [model, model.decoder] + + # Pattern 4: No language_model found return None diff --git a/modelopt/torch/export/unified_export_hf.py 
b/modelopt/torch/export/unified_export_hf.py index 5703f4515..f906c6797 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -148,13 +148,13 @@ def _collect_shared_input_modules( def _input_hook(module, input, output): """Update dictionary with list of all modules that share the same input.""" if len(input) > 0 and isinstance(input[0], torch.Tensor): - # TODO: Handle DBRX MoE case - input_to_linear[input[0]].append(module) + # TODO: Handle DBRX MoE case + input_to_linear[input[0]].append(module) def _output_hook(module, input, output): """Update dictionary with mapping of layernorms and their outputs.""" if output_to_layernorm is not None and isinstance(output, torch.Tensor): - output_to_layernorm[output] = module + output_to_layernorm[output] = module handles = [] @@ -316,29 +316,36 @@ def llm_dummy_forward(): [1, model.config.num_mel_bins, feature_extractor.nb_max_frames], dtype=model.dtype ).to(model.device) - if getattr(model.config, "is_encoder_decoder", False): - # For encoder-decoder models, we need to pass both the encoder and decoder input ids + # Check if this is Nemotron-Parse (encoder-decoder VL model) + architectures = getattr(model.config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + if is_vl_model and ("nemotron" in model_type or is_nemotron_parse): + # For Nemotron VL models (including Nemotron-Parse), run optimization on just the + # language model/decoder. This avoids needing pixel_values for the vision encoder. 
+ language_model_lineage = get_language_model_from_vl(model) + + if language_model_lineage is not None: + language_model = language_model_lineage[-1] + print( + f"Running optimization on language model with fake_input shape: {fake_input.shape}" + ) + # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors + if is_nemotron_parse: + language_model(fake_input, use_cache=False) + else: + language_model(fake_input) + else: + raise ValueError( + f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " + "This is required for requantization/resmoothing optimization. " + "Please ensure the model architecture is supported or file an issue." + ) + elif getattr(model.config, "is_encoder_decoder", False): + # For other encoder-decoder models (non-VL), pass both encoder and decoder input ids model(fake_input, decoder_input_ids=decoder_fake_input) - elif is_vl_model and "nemotron" in model_type: - # For Nemotron VL models, try to run optimization on just the language model part - language_model_lineage = get_language_model_from_vl(model) - - if language_model_lineage is not None: - # Run optimization on just the language model with the same input format as regular LLMs - # Use the same fake_input tensor that regular LLMs use - language_model = language_model_lineage[-1] - print( - f"Running optimization on language model with fake_input shape: {fake_input.shape}" - ) - language_model(fake_input) else: - raise ValueError( - f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " - "This is required for requantization/resmoothing optimization. " - "Please ensure the model architecture is supported or file an issue." 
- ) - else: - model(fake_input) + model(fake_input) input_to_linear, output_to_layernorm = _collect_shared_input_modules( model, llm_dummy_forward, collect_layernorms=True @@ -405,9 +412,19 @@ def _export_quantized_weight( if quantization_format == QUANTIZATION_FP8: # Convert amax to float32 + # Note: Use the public 'amax' property, not the private '_amax' attribute + if hasattr(weight_quantizer, '_amax') and weight_quantizer._amax is not None: weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) - - if weight_quantizer._amax.dim() == 1: + amax_tensor = weight_quantizer._amax + else: + # Fallback to public amax property + amax_tensor = weight_quantizer.amax + if amax_tensor is not None and hasattr(amax_tensor, 'to'): + amax_tensor = amax_tensor.to(torch.float32) + + # Only compute scaling factor if amax_tensor is valid + if amax_tensor is not None and hasattr(amax_tensor, 'dim'): + if amax_tensor.dim() == 1: # Per-tensor amax weight_scaling_factor = torch.tensor( weight_quantizer.amax.item() / weight_quantizer.maxbound @@ -421,8 +438,9 @@ def _export_quantized_weight( weight_scaling_factor, ) - if hasattr(input_quantizer, "_amax"): + if hasattr(input_quantizer, "_amax") or (hasattr(input_quantizer, "amax") and input_quantizer.amax is not None): assert input_quantizer is not None + if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: input_quantizer._amax = input_quantizer._amax.to(torch.float32) sub_module.register_buffer( @@ -432,8 +450,9 @@ def _export_quantized_weight( ).squeeze(), ) - if hasattr(output_quantizer, "_amax"): + if hasattr(output_quantizer, "_amax") or (hasattr(output_quantizer, "amax") and output_quantizer.amax is not None): assert output_quantizer is not None + if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: output_quantizer._amax = output_quantizer._amax.to(torch.float32) else: # Register weight_scale and input_scale @@ -451,7 +470,7 @@ def _export_quantized_weight( ) 
sub_module.register_buffer(quantizer_attrs.weight_scale, e8m0_scale) if hasattr(weight_quantizer, "_scale") and weight_quantizer._scale is not None: - del weight_quantizer._scale + del weight_quantizer._scale else: sub_module.register_buffer( quantizer_attrs.weight_scale, get_weight_scaling_factor(sub_module, weight_name) @@ -485,6 +504,13 @@ def _export_quantized_weight( weight_scale: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale, None) weight_scale_2: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale_2, None) + # If weight_scale is None (e.g., quantizer wasn't calibrated), skip quantization for this module + # This can happen for modules that were disabled from quantization or have invalid calibration data + if weight_scale is None and quantization_format not in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]: + # For NVFP4, weight_scale is computed later, so we can't check here + print(f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found") + return + # Transpose weight for bmm-style expert quantization (llama4, gpt-oss) # Check if this is a BMM-style expert weight that needs transposition is_bmm_expert_weight = weight.dim() == 3 and any( From 8ee2778d88b7ca4517268286dbb9df7a26d9b862 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:36:03 -0800 Subject: [PATCH 02/11] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/vlm_utils.py | 16 +++++--- modelopt/torch/export/unified_export_hf.py | 44 +++++++++++++++------- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 4789130cd..2d3d9f82c 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -173,8 +173,10 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): print(f" Moved inputs to {model_device}") # Verify we have pixel_values for the vision encoder - if not 
hasattr(inputs, 'pixel_values') or inputs.pixel_values is None: - raise ValueError("Processor did not generate pixel_values. Check processor configuration.") + if not hasattr(inputs, "pixel_values") or inputs.pixel_values is None: + raise ValueError( + "Processor did not generate pixel_values. Check processor configuration." + ) # Generate response using model.generate if isinstance(generation_config, GenerationConfig): @@ -203,13 +205,17 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): ] # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode - if is_nemotron_parse and hasattr(tokenizer, 'batch_decode'): + if is_nemotron_parse and hasattr(tokenizer, "batch_decode"): output_text = tokenizer.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, ) else: output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, ) if output_text is None or len(output_text) == 0: diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index f906c6797..6efcb1e6a 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -413,32 +413,41 @@ def _export_quantized_weight( if quantization_format == QUANTIZATION_FP8: # Convert amax to float32 # Note: Use the public 'amax' property, not the private '_amax' attribute - if hasattr(weight_quantizer, '_amax') and weight_quantizer._amax is not None: - weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) + if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: + weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) amax_tensor = weight_quantizer._amax else: # Fallback 
to public amax property amax_tensor = weight_quantizer.amax - if amax_tensor is not None and hasattr(amax_tensor, 'to'): + if amax_tensor is not None and hasattr(amax_tensor, "to"): amax_tensor = amax_tensor.to(torch.float32) # Only compute scaling factor if amax_tensor is valid - if amax_tensor is not None and hasattr(amax_tensor, 'dim'): + if amax_tensor is not None and hasattr(amax_tensor, "dim"): if amax_tensor.dim() == 1: - # Per-tensor amax - weight_scaling_factor = torch.tensor( - weight_quantizer.amax.item() / weight_quantizer.maxbound + # Per-tensor amax + weight_scaling_factor = torch.tensor( + weight_quantizer.amax.item() / weight_quantizer.maxbound + ) + else: + # Per-channel amax + weight_scaling_factor = torch.tensor( + weight_quantizer.amax / weight_quantizer.maxbound + ) + + sub_module.register_buffer( + quantizer_attrs.weight_scale, + weight_scaling_factor, ) - else: - # Per-channel amax - weight_scaling_factor = torch.tensor(weight_quantizer.amax / weight_quantizer.maxbound) sub_module.register_buffer( quantizer_attrs.weight_scale, weight_scaling_factor, ) - if hasattr(input_quantizer, "_amax") or (hasattr(input_quantizer, "amax") and input_quantizer.amax is not None): + if hasattr(input_quantizer, "_amax") or ( + hasattr(input_quantizer, "amax") and input_quantizer.amax is not None + ): assert input_quantizer is not None if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: input_quantizer._amax = input_quantizer._amax.to(torch.float32) @@ -450,7 +459,9 @@ def _export_quantized_weight( ).squeeze(), ) - if hasattr(output_quantizer, "_amax") or (hasattr(output_quantizer, "amax") and output_quantizer.amax is not None): + if hasattr(output_quantizer, "_amax") or ( + hasattr(output_quantizer, "amax") and output_quantizer.amax is not None + ): assert output_quantizer is not None if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: output_quantizer._amax = output_quantizer._amax.to(torch.float32) @@ -506,9 
+517,14 @@ def _export_quantized_weight( # If weight_scale is None (e.g., quantizer wasn't calibrated), skip quantization for this module # This can happen for modules that were disabled from quantization or have invalid calibration data - if weight_scale is None and quantization_format not in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]: + if weight_scale is None and quantization_format not in [ + QUANTIZATION_NVFP4, + QUANTIZATION_NVFP4_AWQ, + ]: # For NVFP4, weight_scale is computed later, so we can't check here - print(f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found") + print( + f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found" + ) return # Transpose weight for bmm-style expert quantization (llama4, gpt-oss) From 2f2203cbe24ad9497df7976d97699553c3ddfa60 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:38:52 -0800 Subject: [PATCH 03/11] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/unified_export_hf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 6efcb1e6a..eb33ee8fe 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -446,7 +446,9 @@ def _export_quantized_weight( ) if hasattr(input_quantizer, "_amax") or ( - hasattr(input_quantizer, "amax") and input_quantizer.amax is not None + input_quantizer is not None + and hasattr(input_quantizer, "amax") + and input_quantizer.amax is not None ): assert input_quantizer is not None if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: @@ -460,7 +462,9 @@ def _export_quantized_weight( ) if hasattr(output_quantizer, "_amax") or ( - hasattr(output_quantizer, "amax") and output_quantizer.amax is not None + output_quantizer is not None + and hasattr(output_quantizer, "amax") + and 
output_quantizer.amax is not None ): assert output_quantizer is not None if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: From 7ea5e0e870242696042b50d39088d7006e0b7586 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 20 Jan 2026 17:42:11 -0800 Subject: [PATCH 04/11] add image-text data calibration support Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 90 +++++++++++++++++++--- examples/llm_ptq/hf_ptq.py | 81 ++++++++++--------- modelopt/torch/export/unified_export_hf.py | 10 +-- 3 files changed, 128 insertions(+), 53 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index cfd218abf..3fb6f1ceb 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -276,9 +276,33 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok if "vila" in ckpt_path.lower(): ckpt_path += "/llm" - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, trust_remote_code=trust_remote_code, **kwargs - ) + # Suppress verbose tokenizer output (e.g., printing all special tokens) + import contextlib + import io + import logging + import os + + # Save current settings + old_verbosity = os.environ.get("TOKENIZERS_PARALLELISM", None) + transformers_log_level = logging.getLogger("transformers").level + + # Suppress output + os.environ["TOKENIZERS_PARALLELISM"] = "false" + logging.getLogger("transformers").setLevel(logging.ERROR) + + # Also capture stdout to suppress verbose tokenizer printing + with contextlib.redirect_stdout(io.StringIO()): + try: + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, trust_remote_code=trust_remote_code, **kwargs + ) + finally: + # Restore original settings + if old_verbosity is not None: + os.environ["TOKENIZERS_PARALLELISM"] = old_verbosity + else: + os.environ.pop("TOKENIZERS_PARALLELISM", None) + logging.getLogger("transformers").setLevel(transformers_log_level) # can't set attribute 'pad_token' for "" 
# We skip this step for Nemo models @@ -334,10 +358,23 @@ def get_processor( # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) # This will only work if the model has a processor config try: - processor = AutoProcessor.from_pretrained( - ckpt_path, - **model_kwargs, - ) + import contextlib + import io + import logging + + # Suppress verbose output from processor/tokenizer loading + transformers_log_level = logging.getLogger("transformers").level + logging.getLogger("transformers").setLevel(logging.ERROR) + + with contextlib.redirect_stdout(io.StringIO()): + processor = AutoProcessor.from_pretrained( + ckpt_path, + **model_kwargs, + ) + + # Restore logging + logging.getLogger("transformers").setLevel(transformers_log_level) + print(f"Loaded AutoProcessor for model type: {model_type}") return processor except Exception as e: @@ -476,12 +513,26 @@ def get_model( # Load config once and handle VL model detection try: hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs) + + # Check specifically for Nemotron-Parse + architectures = getattr(hf_config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + if is_nemotron_vl(hf_config): - print( - "Detected Nemotron VL model from config. " - "Disabling automatic device mapping for compatibility." - ) - device_map = None + if is_nemotron_parse: + # Nemotron-Parse works fine with device_map="auto" + # Keep device_map="auto" to ensure proper device placement + print( + "Detected Nemotron-Parse model from config. " + "Using automatic device mapping." + ) + else: + # For other Nemotron VL models, disable device_map for compatibility + print( + "Detected Nemotron VL model from config. " + "Disabling automatic device mapping for compatibility." 
+ ) + device_map = None except Exception as e: print(f"Error: Could not load config from {ckpt_path}: {e}") raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e @@ -590,6 +641,21 @@ def get_model( print(f"Moving model to {device} device...") model = model.to(device) + # For Nemotron-Parse, ensure the encoder (including RADIO) is fully on device + # The RADIO encoder has buffers that might not be properly moved even with device_map="auto" + # This is because custom RADIO modules might not fully support accelerate's device_map + if device != "cpu" and hasattr(model, "encoder"): + # Check if encoder has any buffers on CPU + cpu_buffers = [] + for name, buffer in model.encoder.named_buffers(): + if buffer.device.type == "cpu": + cpu_buffers.append(name) + + if cpu_buffers: + print(f"Found {len(cpu_buffers)} encoder buffers on CPU. Moving encoder to {device}...") + model.encoder = model.encoder.to(device) + print(f"Encoder moved to {device}") + if device == "cuda" and not is_model_on_gpu(model): print("Warning: Some parameters are not on a GPU. 
Calibration can be slow or hit OOM") diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index b4be53ecf..ad552ee43 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -66,6 +66,7 @@ ) from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor from modelopt.torch.utils.memory_monitor import launch_memory_monitor +from modelopt.torch.utils.nemotron_vlm_dataset_utils import get_nemotron_vlm_dataset_dataloader from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader @@ -141,6 +142,7 @@ def make_calib_dataloader( tokenizer: PreTrainedTokenizerBase | None, device: torch.device, model_type: str | None, + full_model: torch.nn.Module | None = None, ) -> tuple[DataLoader, str | None]: calib_dataloader = None first_text_speech_dataset = None @@ -402,18 +404,35 @@ def load_model(args: argparse.Namespace): language_model = extracted_lm model_type = extracted_model_type else: + # Check if this is a Nemotron VL model that needs a processor + # Do this BEFORE setting default datasets so we can use image-text data for Nemotron-Parse + is_nemotron_vl_model = is_nemotron_vl(full_model) + + # Check specifically for Nemotron-Parse to set appropriate dataset defaults + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + if args.dataset is None: - args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] - warnings.warn( - "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." - ) + if is_nemotron_parse: + # For Nemotron-Parse, default to Nemotron VLM Dataset v2 + args.dataset = ["nemotron_vlm_v2"] + print( + "No dataset specified. Defaulting to 'nemotron_vlm_v2' for Nemotron-Parse " + "(NVIDIA's image-text dataset for better calibration)." 
+ ) + else: + # For other models, use text-only datasets + args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] + warnings.warn( + "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." + ) + # Adjust calib_size to match dataset length by extending or truncating as needed args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[ : len(args.dataset) ] - # Check if this is a Nemotron VL model that needs a processor - is_nemotron_vl_model = is_nemotron_vl(full_model) if is_nemotron_vl_model: # Load processor for Nemotron VL models (like Nemotron-Parse) processor = get_processor( @@ -506,14 +525,23 @@ def mono_quantize( "Consider reducing calib_size to reduce calibration time.\n####\n" ) + # Check if this is Nemotron-Parse + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + original_forward = None # Track original forward method if we wrap it + # For Nemotron VL models, disable quantization of vision components if is_nemotron_vl_model: print("Disabling quantization for vision components in Nemotron VL model") quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} quant_cfg["quant_cfg"]["*image*"] = {"enable": False} - # Also disable radio model components specifically + # Also disable radio model components specifically (for Nemotron-Parse) quant_cfg["quant_cfg"]["*radio*"] = {"enable": False} quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} + quant_cfg["quant_cfg"]["*encoder*"] = {"enable": False} # Disable encoder + quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False} # Nemotron-Parse specific + print("Quantization will only be applied to the decoder (text generation) component") if not model_is_already_quantized or calibration_only: if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only": @@ -541,8 +569,15 @@ def mono_quantize( else: language_model = 
mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop) - # For VL models, update full_model to use the quantized language model - if is_nemotron_vl_model: + # Restore original forward method if we wrapped it for Nemotron-Parse + if is_nemotron_parse and original_forward is not None: + print("Restoring original forward method after calibration") + language_model.forward = original_forward + original_forward = None + + # For VL models (except Nemotron-Parse), update full_model to use the quantized language model + # For Nemotron-Parse, language_model IS full_model, so no update needed + if is_nemotron_vl_model and language_model is not full_model: language_model_lineage = get_language_model_from_vl(full_model) if language_model_lineage is not None: print("Updating full_model with quantized language_model...") @@ -866,38 +901,12 @@ def quantize_main( print(f"Use calib batch_size {args.batch_size}") calib_dataloader, first_text_speech_dataset = make_calib_dataloader( - args, language_model, processor, tokenizer, device, model_type + args, language_model, processor, tokenizer, device, model_type, full_model ) # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) - # For Nemotron-Parse, wrap the text-only dataloader to add dummy images - # Nemotron-Parse is an encoder-decoder model that requires pixel_values - if is_nemotron_vl_model and processor is not None: - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - - if is_nemotron_parse: - # Check if we're quantizing just the decoder or the full model - decoder_only = language_model is not full_model - - if decoder_only: - print( - "Calibration will use text-only inputs for Nemotron-Parse decoder. " - "Vision encoder is excluded from quantization." 
- ) - else: - print( - "Wrapping calibration dataloader for Nemotron-Parse to add dummy images. " - "Nemotron-Parse requires pixel_values for full model calibration." - ) - - calib_dataloader = create_nemotron_parse_calib_wrapper( - calib_dataloader, processor, device, decoder_only=decoder_only - ) - preview_input_ids, generated_ids_before_ptq = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index eb33ee8fe..424c5f8e9 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -330,11 +330,11 @@ def llm_dummy_forward(): print( f"Running optimization on language model with fake_input shape: {fake_input.shape}" ) - # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors - if is_nemotron_parse: - language_model(fake_input, use_cache=False) - else: - language_model(fake_input) + # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors + if is_nemotron_parse: + language_model(fake_input, use_cache=False) + else: + language_model(fake_input) else: raise ValueError( f"Cannot extract language_model from Nemotron VL model (type: {model_type}). 
" From d1963559a05813d3dafec92f0269b82a4011b33d Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 20 Jan 2026 17:42:46 -0800 Subject: [PATCH 05/11] add image-text data calibration support Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 3fb6f1ceb..23ff97f56 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -522,10 +522,7 @@ def get_model( if is_nemotron_parse: # Nemotron-Parse works fine with device_map="auto" # Keep device_map="auto" to ensure proper device placement - print( - "Detected Nemotron-Parse model from config. " - "Using automatic device mapping." - ) + print("Detected Nemotron-Parse model from config. Using automatic device mapping.") else: # For other Nemotron VL models, disable device_map for compatibility print( From dc1af904cd87e98d90efbfb1e69abaf2bd5d8109 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 9 Feb 2026 22:28:42 -0800 Subject: [PATCH 06/11] fix issues caused by rebase and simplify Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/hf_ptq.py | 37 +++------------ modelopt/torch/export/unified_export_hf.py | 53 ++++++++++------------ 2 files changed, 30 insertions(+), 60 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index ad552ee43..0912c5d83 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -66,7 +66,6 @@ ) from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor from modelopt.torch.utils.memory_monitor import launch_memory_monitor -from modelopt.torch.utils.nemotron_vlm_dataset_utils import get_nemotron_vlm_dataset_dataloader from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader @@ -142,7 +141,6 @@ def make_calib_dataloader( 
tokenizer: PreTrainedTokenizerBase | None, device: torch.device, model_type: str | None, - full_model: torch.nn.Module | None = None, ) -> tuple[DataLoader, str | None]: calib_dataloader = None first_text_speech_dataset = None @@ -525,12 +523,6 @@ def mono_quantize( "Consider reducing calib_size to reduce calibration time.\n####\n" ) - # Check if this is Nemotron-Parse - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - original_forward = None # Track original forward method if we wrap it - # For Nemotron VL models, disable quantization of vision components if is_nemotron_vl_model: print("Disabling quantization for vision components in Nemotron VL model") @@ -569,15 +561,8 @@ def mono_quantize( else: language_model = mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop) - # Restore original forward method if we wrapped it for Nemotron-Parse - if is_nemotron_parse and original_forward is not None: - print("Restoring original forward method after calibration") - language_model.forward = original_forward - original_forward = None - - # For VL models (except Nemotron-Parse), update full_model to use the quantized language model - # For Nemotron-Parse, language_model IS full_model, so no update needed - if is_nemotron_vl_model and language_model is not full_model: + # For VL models, update full_model to use the quantized language model + if is_nemotron_vl_model: language_model_lineage = get_language_model_from_vl(full_model) if language_model_lineage is not None: print("Updating full_model with quantized language_model...") @@ -717,20 +702,10 @@ def pre_quantize( post-quantize generation. 
""" - # Check if this is Nemotron-Parse (encoder-decoder model) - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - # Only run single sample for preview - # For Nemotron-Parse, use decoder_input_ids instead of input_ids - sample_batch = next(iter(calib_dataloader)) - if is_nemotron_parse and "decoder_input_ids" in sample_batch: - preview_input_ids = sample_batch["decoder_input_ids"][0:1] - elif model_type == "whisper": - preview_input_ids = sample_batch["input_features"][0:1] - else: - preview_input_ids = sample_batch["input_ids"][0:1] + preview_input_ids = next(iter(calib_dataloader))[ + "input_features" if model_type == "whisper" else "input_ids" + ][0:1] # Generate preview before quantization if model_type == "deepseek": @@ -901,7 +876,7 @@ def quantize_main( print(f"Use calib batch_size {args.batch_size}") calib_dataloader, first_text_speech_dataset = make_calib_dataloader( - args, language_model, processor, tokenizer, device, model_type, full_model + args, language_model, processor, tokenizer, device, model_type ) # Detect if this is a Nemotron VL model using architecture-based detection diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 424c5f8e9..878970dd3 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -148,13 +148,13 @@ def _collect_shared_input_modules( def _input_hook(module, input, output): """Update dictionary with list of all modules that share the same input.""" if len(input) > 0 and isinstance(input[0], torch.Tensor): - # TODO: Handle DBRX MoE case - input_to_linear[input[0]].append(module) + # TODO: Handle DBRX MoE case + input_to_linear[input[0]].append(module) def _output_hook(module, input, output): """Update dictionary with mapping of layernorms and their outputs.""" if output_to_layernorm is not None and 
isinstance(output, torch.Tensor): - output_to_layernorm[output] = module + output_to_layernorm[output] = module handles = [] @@ -323,29 +323,29 @@ def llm_dummy_forward(): if is_vl_model and ("nemotron" in model_type or is_nemotron_parse): # For Nemotron VL models (including Nemotron-Parse), run optimization on just the # language model/decoder. This avoids needing pixel_values for the vision encoder. - language_model_lineage = get_language_model_from_vl(model) + language_model_lineage = get_language_model_from_vl(model) - if language_model_lineage is not None: - language_model = language_model_lineage[-1] - print( - f"Running optimization on language model with fake_input shape: {fake_input.shape}" - ) - # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors - if is_nemotron_parse: - language_model(fake_input, use_cache=False) - else: - language_model(fake_input) + if language_model_lineage is not None: + language_model = language_model_lineage[-1] + print( + f"Running optimization on language model with fake_input shape: {fake_input.shape}" + ) + # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors + if is_nemotron_parse: + language_model(fake_input, use_cache=False) else: - raise ValueError( - f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " - "This is required for requantization/resmoothing optimization. " - "Please ensure the model architecture is supported or file an issue." - ) + language_model(fake_input) + else: + raise ValueError( + f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " + "This is required for requantization/resmoothing optimization. " + "Please ensure the model architecture is supported or file an issue." 
+ ) elif getattr(model.config, "is_encoder_decoder", False): # For other encoder-decoder models (non-VL), pass both encoder and decoder input ids model(fake_input, decoder_input_ids=decoder_fake_input) - else: - model(fake_input) + else: + model(fake_input) input_to_linear, output_to_layernorm = _collect_shared_input_modules( model, llm_dummy_forward, collect_layernorms=True @@ -440,11 +440,6 @@ def _export_quantized_weight( weight_scaling_factor, ) - sub_module.register_buffer( - quantizer_attrs.weight_scale, - weight_scaling_factor, - ) - if hasattr(input_quantizer, "_amax") or ( input_quantizer is not None and hasattr(input_quantizer, "amax") @@ -452,7 +447,7 @@ def _export_quantized_weight( ): assert input_quantizer is not None if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: - input_quantizer._amax = input_quantizer._amax.to(torch.float32) + input_quantizer._amax = input_quantizer._amax.to(torch.float32) sub_module.register_buffer( quantizer_attrs.input_scale, @@ -468,7 +463,7 @@ def _export_quantized_weight( ): assert output_quantizer is not None if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: - output_quantizer._amax = output_quantizer._amax.to(torch.float32) + output_quantizer._amax = output_quantizer._amax.to(torch.float32) else: # Register weight_scale and input_scale if quantization_format == QUANTIZATION_FP8_PB_REAL: @@ -485,7 +480,7 @@ def _export_quantized_weight( ) sub_module.register_buffer(quantizer_attrs.weight_scale, e8m0_scale) if hasattr(weight_quantizer, "_scale") and weight_quantizer._scale is not None: - del weight_quantizer._scale + del weight_quantizer._scale else: sub_module.register_buffer( quantizer_attrs.weight_scale, get_weight_scaling_factor(sub_module, weight_name) From e0e28cb3887f07d75a484722d29402ab2214c670 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 10 Feb 2026 00:02:05 -0800 Subject: [PATCH 07/11] clean up Signed-off-by: Zhiyu Cheng --- 
examples/llm_ptq/example_utils.py | 63 ++++++++++------------------ examples/llm_ptq/hf_ptq.py | 37 ++-------------- examples/llm_ptq/vlm_utils.py | 6 +-- modelopt/torch/export/model_utils.py | 4 +- 4 files changed, 29 insertions(+), 81 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 23ff97f56..e4e7fe1b9 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -276,33 +276,20 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok if "vila" in ckpt_path.lower(): ckpt_path += "/llm" - # Suppress verbose tokenizer output (e.g., printing all special tokens) - import contextlib - import io - import logging - import os - - # Save current settings - old_verbosity = os.environ.get("TOKENIZERS_PARALLELISM", None) - transformers_log_level = logging.getLogger("transformers").level - - # Suppress output - os.environ["TOKENIZERS_PARALLELISM"] = "false" - logging.getLogger("transformers").setLevel(logging.ERROR) - - # Also capture stdout to suppress verbose tokenizer printing - with contextlib.redirect_stdout(io.StringIO()): - try: + # Some custom tokenizers (e.g., Nemotron-Parse) print verbose output when loading. + # Only suppress stdout for trust_remote_code models where custom tokenizer code may be noisy. 
+ if trust_remote_code: + import contextlib + import io + + with contextlib.redirect_stdout(io.StringIO()): tokenizer = AutoTokenizer.from_pretrained( ckpt_path, trust_remote_code=trust_remote_code, **kwargs ) - finally: - # Restore original settings - if old_verbosity is not None: - os.environ["TOKENIZERS_PARALLELISM"] = old_verbosity - else: - os.environ.pop("TOKENIZERS_PARALLELISM", None) - logging.getLogger("transformers").setLevel(transformers_log_level) + else: + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, trust_remote_code=trust_remote_code, **kwargs + ) # can't set attribute 'pad_token' for "" # We skip this step for Nemo models @@ -355,25 +342,17 @@ def get_processor( return MllamaImageProcessor(processor, device) else: - # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) - # This will only work if the model has a processor config - try: - import contextlib - import io - import logging - - # Suppress verbose output from processor/tokenizer loading - transformers_log_level = logging.getLogger("transformers").level - logging.getLogger("transformers").setLevel(logging.ERROR) - - with contextlib.redirect_stdout(io.StringIO()): - processor = AutoProcessor.from_pretrained( - ckpt_path, - **model_kwargs, - ) + # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse). + # Suppress stdout for trust_remote_code models where custom processor code may be noisy. 
+ import contextlib + import io - # Restore logging - logging.getLogger("transformers").setLevel(transformers_log_level) + try: + if model_kwargs.get("trust_remote_code", False): + with contextlib.redirect_stdout(io.StringIO()): + processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs) + else: + processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs) print(f"Loaded AutoProcessor for model type: {model_type}") return processor diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 0912c5d83..0a414e408 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -402,44 +402,15 @@ def load_model(args: argparse.Namespace): language_model = extracted_lm model_type = extracted_model_type else: - # Check if this is a Nemotron VL model that needs a processor - # Do this BEFORE setting default datasets so we can use image-text data for Nemotron-Parse - is_nemotron_vl_model = is_nemotron_vl(full_model) - - # Check specifically for Nemotron-Parse to set appropriate dataset defaults - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - if args.dataset is None: - if is_nemotron_parse: - # For Nemotron-Parse, default to Nemotron VLM Dataset v2 - args.dataset = ["nemotron_vlm_v2"] - print( - "No dataset specified. Defaulting to 'nemotron_vlm_v2' for Nemotron-Parse " - "(NVIDIA's image-text dataset for better calibration)." - ) - else: - # For other models, use text-only datasets - args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] - warnings.warn( - "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." - ) - + args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] + warnings.warn( + "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." 
+ ) # Adjust calib_size to match dataset length by extending or truncating as needed args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[ : len(args.dataset) ] - - if is_nemotron_vl_model: - # Load processor for Nemotron VL models (like Nemotron-Parse) - processor = get_processor( - args.pyt_ckpt_path, - model_type, - device, - trust_remote_code=args.trust_remote_code, - ) - tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code) default_padding_side = tokenizer.padding_side diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 2d3d9f82c..529efeb15 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -126,11 +126,7 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): else: processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - # Check if this is Nemotron-Parse (uses task prompts instead of chat templates) - config = model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - + # is_nemotron_parse was already computed above if is_nemotron_parse: # Nemotron-Parse uses a specific task prompt format # See: https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1#usage-example diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 40c313ad2..5dac1b933 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -147,7 +147,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None: if hasattr(model, "language_model"): return [model, model.language_model] - # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model + # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model. 
+ # Note: This is safe because this function is only called when the model is already detected as a VLM. + # Non-VLM encoder-decoder models (T5, Bart) won't reach this code path. if hasattr(model, "decoder"): return [model, model.decoder] From 3dd8758653cd8edd9844196a555168ce9b4777a9 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 10 Feb 2026 11:30:04 -0800 Subject: [PATCH 08/11] make image-text calib default for VLMs, further simplify Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 144 +++++++-------------- examples/llm_ptq/hf_ptq.py | 8 +- examples/llm_ptq/vlm_utils.py | 92 ++++--------- modelopt/torch/export/unified_export_hf.py | 85 ++++-------- 4 files changed, 103 insertions(+), 226 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index e4e7fe1b9..71755a02f 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -68,39 +68,26 @@ def run_nemotron_vl_preview( """ from vlm_utils import run_text_only_generation, run_vl_preview_generation - # Check if this is Nemotron-Parse (encoder-decoder model that requires images) - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...") + question = tokenizer.decode(input_ids[0], skip_special_tokens=True) + generation_config = { + "max_new_tokens": 100, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } + + # Try text-only generation (may fail for encoder-decoder models like Nemotron-Parse) + text_response = run_text_only_generation( + full_model, tokenizer, question, generation_config, pyt_ckpt_path + ) generated_ids = None - - if not is_nemotron_parse: - # Only try text-only generation for models that support it (not Nemotron-Parse) - print(f"Running text-only preview generation for Nemotron VL model 
({stage_name})...") - question = tokenizer.decode(input_ids[0], skip_special_tokens=True) - generation_config = { - "max_new_tokens": 100, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } - - # Try text-only generation - text_response = run_text_only_generation( - full_model, tokenizer, question, generation_config, pyt_ckpt_path - ) - - if text_response is not None: - print(f"✅ Text-only generation successful: {text_response[:100]}...") - generated_ids = text_response - elif allow_fallback: - print("Text-only generation failed, falling back to standard generate...") - generated_ids = full_model.generate(input_ids, max_new_tokens=100) - else: - print( - f"Skipping text-only generation for Nemotron-Parse ({stage_name}) - " - "this encoder-decoder model requires images for all operations." - ) + if text_response is not None: + print(f"✅ Text-only generation successful: {text_response[:100]}...") + generated_ids = text_response + elif allow_fallback: + print("Text-only generation failed, falling back to standard generate...") + generated_ids = full_model.generate(input_ids, max_new_tokens=100) # Run additional VL test with images print(f"Running additional VL test with images ({stage_name})...") @@ -111,10 +98,6 @@ def run_nemotron_vl_preview( def _is_multimodal_config(config): """Check if a config indicates a multimodal model (config-only version of is_multimodal_model).""" - # Check for Nemotron-Parse encoder-decoder architecture - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - return ( hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL) or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal @@ -123,7 +106,10 @@ def _is_multimodal_config(config): or ( hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer") ) # Image embedding layers - or is_nemotron_parse # Nemotron-Parse conditional generation 
model + or getattr(config, "is_encoder_decoder", False) # Encoder-decoder VL models + or any( # Architecture-based detection for custom VL models (e.g., Nemotron-Parse) + "conditionalgeneration" in arch.lower() for arch in getattr(config, "architectures", []) + ) ) @@ -176,9 +162,20 @@ def calibrate_loop(_model): ) allowed_keys = set(forward_params.keys()) + # Check if model is encoder-decoder (needs decoder_input_ids instead of input_ids) + is_enc_dec = getattr(full_model.config, "is_encoder_decoder", False) + full_model.eval() with torch.no_grad(): for batch in calib_dataloader: + # For encoder-decoder models, rename input_ids → decoder_input_ids + # and disable KV caching to avoid tuple index errors in decoder layers + if is_enc_dec and "input_ids" in batch and "pixel_values" in batch: + batch["decoder_input_ids"] = batch.pop("input_ids") + if "attention_mask" in batch: + batch["decoder_attention_mask"] = batch.pop("attention_mask") + batch["use_cache"] = False + # Filter batch to only include parameters the model accepts if accepts_kwargs: call_kwargs = batch @@ -190,10 +187,8 @@ def calibrate_loop(_model): # Use safe_nemotron_vl_forward for Nemotron Nano VL (embedding-injection style) # For other VLMs (like Nemotron-Parse), use standard forward if hasattr(full_model, "img_context_token_id"): - # Nemotron Nano VL style safe_nemotron_vl_forward(full_model, call_kwargs) else: - # Standard encoder-decoder or other VLM architectures full_model(**call_kwargs) return calibrate_loop @@ -276,20 +271,9 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok if "vila" in ckpt_path.lower(): ckpt_path += "/llm" - # Some custom tokenizers (e.g., Nemotron-Parse) print verbose output when loading. - # Only suppress stdout for trust_remote_code models where custom tokenizer code may be noisy. 
- if trust_remote_code: - import contextlib - import io - - with contextlib.redirect_stdout(io.StringIO()): - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, trust_remote_code=trust_remote_code, **kwargs - ) - else: - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, trust_remote_code=trust_remote_code, **kwargs - ) + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, trust_remote_code=trust_remote_code, **kwargs + ) # can't set attribute 'pad_token' for "" # We skip this step for Nemo models @@ -342,18 +326,9 @@ def get_processor( return MllamaImageProcessor(processor, device) else: - # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse). - # Suppress stdout for trust_remote_code models where custom processor code may be noisy. - import contextlib - import io - + # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) try: - if model_kwargs.get("trust_remote_code", False): - with contextlib.redirect_stdout(io.StringIO()): - processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs) - else: - processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs) - + processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs) print(f"Loaded AutoProcessor for model type: {model_type}") return processor except Exception as e: @@ -493,22 +468,12 @@ def get_model( try: hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs) - # Check specifically for Nemotron-Parse - architectures = getattr(hf_config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - if is_nemotron_vl(hf_config): - if is_nemotron_parse: - # Nemotron-Parse works fine with device_map="auto" - # Keep device_map="auto" to ensure proper device placement - print("Detected Nemotron-Parse model from config. Using automatic device mapping.") - else: - # For other Nemotron VL models, disable device_map for compatibility - print( - "Detected Nemotron VL model from config. 
" - "Disabling automatic device mapping for compatibility." - ) - device_map = None + print( + "Detected Nemotron VL model from config. " + "Disabling automatic device mapping for compatibility." + ) + device_map = None except Exception as e: print(f"Error: Could not load config from {ckpt_path}: {e}") raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e @@ -564,13 +529,17 @@ def get_model( if not hasattr(transformers, architecture): warnings.warn( f"Architecture {architecture} not found in transformers: {transformers.__version__}. " - "Falling back to AutoModel." + "Falling back to AutoModelForCausalLM (or AutoModel for non-causal architectures)." ) assert trust_remote_code, ( "Please set trust_remote_code to True if you want to use this architecture" ) - auto_model_module = AutoModel + # Use AutoModelForCausalLM for causal LMs, AutoModel for encoder-decoder models + if getattr(hf_config, "is_encoder_decoder", False): + auto_model_module = AutoModel + else: + auto_model_module = AutoModelForCausalLM from_config = auto_model_module.from_config else: auto_model_module = getattr(transformers, architecture) @@ -617,21 +586,6 @@ def get_model( print(f"Moving model to {device} device...") model = model.to(device) - # For Nemotron-Parse, ensure the encoder (including RADIO) is fully on device - # The RADIO encoder has buffers that might not be properly moved even with device_map="auto" - # This is because custom RADIO modules might not fully support accelerate's device_map - if device != "cpu" and hasattr(model, "encoder"): - # Check if encoder has any buffers on CPU - cpu_buffers = [] - for name, buffer in model.encoder.named_buffers(): - if buffer.device.type == "cpu": - cpu_buffers.append(name) - - if cpu_buffers: - print(f"Found {len(cpu_buffers)} encoder buffers on CPU. 
Moving encoder to {device}...") - model.encoder = model.encoder.to(device) - print(f"Encoder moved to {device}") - if device == "cuda" and not is_model_on_gpu(model): print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM") diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 0a414e408..664dd04f0 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -361,6 +361,12 @@ def load_model(args: argparse.Namespace): default_pad_token = None is_nemotron_vl_model = is_nemotron_vl(full_model) + + # Default to image-text calibration for VLM models + if is_nemotron_vl_model and not args.calib_with_images: + print("Nemotron VL model detected. Enabling image-text calibration by default.") + args.calib_with_images = True + if model_type == "mllama": processor = get_processor( args.pyt_ckpt_path, @@ -689,7 +695,7 @@ def pre_quantize( preview_input_ids, args.pyt_ckpt_path, "before quantization", - allow_fallback=True, + allow_fallback=False, ) else: # Standard generation for non-Nemotron VL models diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 529efeb15..9919e405b 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -18,7 +18,7 @@ import os from PIL import Image -from transformers import AutoImageProcessor, AutoProcessor, GenerationConfig +from transformers import AutoImageProcessor, AutoProcessor def run_vl_preview_generation(model, tokenizer, model_path, stage_name): @@ -73,34 +73,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): print(" Skipping VL preview generation.") return None - # Check if this is Nemotron-Parse early to set up proper generation config - config = model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - # Generate response question = "Describe this image briefly." 
# Updated for single image - - # Use model's GenerationConfig for Nemotron-Parse, dict for others - if is_nemotron_parse: - try: - generation_config = GenerationConfig.from_pretrained( - model_path, trust_remote_code=True - ) - print("Using Nemotron-Parse GenerationConfig from model") - except Exception as e: - print(f"Warning: Could not load GenerationConfig: {e}, using defaults") - generation_config = { - "max_new_tokens": 50, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } - else: - generation_config = { - "max_new_tokens": 50, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } + generation_config = { + "max_new_tokens": 50, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } print(f"Generating VL response ({stage_name})...") @@ -126,14 +105,8 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): else: processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - # is_nemotron_parse was already computed above - if is_nemotron_parse: - # Nemotron-Parse uses a specific task prompt format - # See: https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1#usage-example - prompt = "" - print(f"Using Nemotron-Parse task prompt: {prompt}") - else: - # Other VL models use chat templates + # Use chat template if available, otherwise fall back to default task prompt + if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None: messages = [ {"role": "system", "content": "/no_think"}, { @@ -150,11 +123,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): ], }, ] - - # Apply chat template prompt = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) + else: + # For models without chat templates (e.g., encoder-decoder VL models), + # use the tokenizer's bos/eos tokens as a minimal prompt + prompt = (tokenizer.bos_token or "") + question # Process inputs using the processor with single image inputs = 
processor( @@ -175,22 +150,12 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): ) # Generate response using model.generate - if isinstance(generation_config, GenerationConfig): - # For Nemotron-Parse with GenerationConfig object - generated_ids = model.generate( - pixel_values=inputs.pixel_values, - input_ids=inputs.input_ids, - attention_mask=inputs.attention_mask, - generation_config=generation_config, - ) - else: - # For other models with dict generation config - generated_ids = model.generate( - pixel_values=inputs.pixel_values, - input_ids=inputs.input_ids, - attention_mask=inputs.attention_mask, - **generation_config, - ) + generated_ids = model.generate( + pixel_values=inputs.pixel_values, + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + **generation_config, + ) # Decode the response (trim input tokens like in the working example) if generated_ids is None: @@ -199,20 +164,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): generated_ids_trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] - - # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode - if is_nemotron_parse and hasattr(tokenizer, "batch_decode"): - output_text = tokenizer.batch_decode( - generated_ids_trimmed, - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) - else: - output_text = processor.batch_decode( - generated_ids_trimmed, - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) + # Use processor.batch_decode if available, otherwise fall back to tokenizer + decoder = processor if hasattr(processor, "batch_decode") else tokenizer + output_text = decoder.batch_decode( + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) if output_text is None or len(output_text) == 0: raise ValueError("Decoding returned empty output") diff --git 
a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 878970dd3..b6b92f6ff 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -316,13 +316,9 @@ def llm_dummy_forward(): [1, model.config.num_mel_bins, feature_extractor.nb_max_frames], dtype=model.dtype ).to(model.device) - # Check if this is Nemotron-Parse (encoder-decoder VL model) - architectures = getattr(model.config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - - if is_vl_model and ("nemotron" in model_type or is_nemotron_parse): - # For Nemotron VL models (including Nemotron-Parse), run optimization on just the - # language model/decoder. This avoids needing pixel_values for the vision encoder. + if is_vl_model and "nemotron" in model_type: + # For Nemotron VL models, run optimization on just the language model/decoder. + # This avoids needing pixel_values for the vision encoder. language_model_lineage = get_language_model_from_vl(model) if language_model_lineage is not None: @@ -330,11 +326,8 @@ def llm_dummy_forward(): print( f"Running optimization on language model with fake_input shape: {fake_input.shape}" ) - # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors - if is_nemotron_parse: - language_model(fake_input, use_cache=False) - else: - language_model(fake_input) + # Pass use_cache=False to avoid KV cache issues in encoder-decoder models + language_model(fake_input, use_cache=False) else: raise ValueError( f"Cannot extract language_model from Nemotron VL model (type: {model_type}). 
" @@ -412,42 +405,25 @@ def _export_quantized_weight( if quantization_format == QUANTIZATION_FP8: # Convert amax to float32 - # Note: Use the public 'amax' property, not the private '_amax' attribute - if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: - weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) - amax_tensor = weight_quantizer._amax - else: - # Fallback to public amax property - amax_tensor = weight_quantizer.amax - if amax_tensor is not None and hasattr(amax_tensor, "to"): - amax_tensor = amax_tensor.to(torch.float32) - - # Only compute scaling factor if amax_tensor is valid - if amax_tensor is not None and hasattr(amax_tensor, "dim"): - if amax_tensor.dim() == 1: - # Per-tensor amax - weight_scaling_factor = torch.tensor( - weight_quantizer.amax.item() / weight_quantizer.maxbound - ) - else: - # Per-channel amax - weight_scaling_factor = torch.tensor( - weight_quantizer.amax / weight_quantizer.maxbound - ) + weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) - sub_module.register_buffer( - quantizer_attrs.weight_scale, - weight_scaling_factor, + if weight_quantizer._amax.dim() == 1: + # Per-tensor amax + weight_scaling_factor = torch.tensor( + weight_quantizer.amax.item() / weight_quantizer.maxbound ) + else: + # Per-channel amax + weight_scaling_factor = torch.tensor(weight_quantizer.amax / weight_quantizer.maxbound) - if hasattr(input_quantizer, "_amax") or ( - input_quantizer is not None - and hasattr(input_quantizer, "amax") - and input_quantizer.amax is not None - ): + sub_module.register_buffer( + quantizer_attrs.weight_scale, + weight_scaling_factor, + ) + + if hasattr(input_quantizer, "_amax"): assert input_quantizer is not None - if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: - input_quantizer._amax = input_quantizer._amax.to(torch.float32) + input_quantizer._amax = input_quantizer._amax.to(torch.float32) sub_module.register_buffer( 
quantizer_attrs.input_scale, @@ -456,14 +432,9 @@ def _export_quantized_weight( ).squeeze(), ) - if hasattr(output_quantizer, "_amax") or ( - output_quantizer is not None - and hasattr(output_quantizer, "amax") - and output_quantizer.amax is not None - ): + if hasattr(output_quantizer, "_amax"): assert output_quantizer is not None - if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: - output_quantizer._amax = output_quantizer._amax.to(torch.float32) + output_quantizer._amax = output_quantizer._amax.to(torch.float32) else: # Register weight_scale and input_scale if quantization_format == QUANTIZATION_FP8_PB_REAL: @@ -514,18 +485,6 @@ def _export_quantized_weight( weight_scale: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale, None) weight_scale_2: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale_2, None) - # If weight_scale is None (e.g., quantizer wasn't calibrated), skip quantization for this module - # This can happen for modules that were disabled from quantization or have invalid calibration data - if weight_scale is None and quantization_format not in [ - QUANTIZATION_NVFP4, - QUANTIZATION_NVFP4_AWQ, - ]: - # For NVFP4, weight_scale is computed later, so we can't check here - print( - f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found" - ) - return - # Transpose weight for bmm-style expert quantization (llama4, gpt-oss) # Check if this is a BMM-style expert weight that needs transposition is_bmm_expert_weight = weight.dim() == 3 and any( From e94fbc17d639f65d7dc875bb5c4cedc47354e2ea Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 11 Feb 2026 14:40:58 -0800 Subject: [PATCH 09/11] use batch_size = 1 for calib_with_images Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/hf_ptq.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 664dd04f0..de434e1cf 100755 --- 
a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -809,14 +809,10 @@ def quantize_main( device: torch.device, ): if args.batch_size == 0: - # Check if this is a vision-language model - # For VL models, skip automatic batch size detection and use a conservative default - # since proper multimodal input preparation is complex - if is_multimodal_model(full_model) or is_nemotron_vl(full_model): - print( - "Vision-language model detected. Using default batch_size=1 for calibration " - "to ensure proper handling of multimodal inputs." - ) + # For VL models with image-text calibration, skip automatic batch size detection + # since get_max_batch_size can't handle multimodal inputs + if args.calib_with_images: + print("Image-text calibration enabled. Using default batch_size=1 for calibration.") args.batch_size = 1 else: # Calibration/sparsification will actually take much more memory than regular inference From 0666b5590e3829c29614cb2fc18f12bb3486ddc2 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 11 Feb 2026 23:23:13 -0800 Subject: [PATCH 10/11] fix ci Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/model_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 5dac1b933..6cb5be9a5 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -148,9 +148,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None: return [model, model.language_model] # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model. - # Note: This is safe because this function is only called when the model is already detected as a VLM. - # Non-VLM encoder-decoder models (T5, Bart) won't reach this code path. 
-    if hasattr(model, "decoder"):
+    # Only match if the model is detected as multimodal to avoid matching non-VLM encoder-decoder
+    # models like T5, Bart, Whisper, which also have .decoder.
+    if hasattr(model, "decoder") and is_multimodal_model(model):
         return [model, model.decoder]
 
     # Pattern 4: No language_model found

From eef7a786d4fbd7c918b07fe548804e5ab6bcdb75 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng
Date: Thu, 12 Feb 2026 16:18:40 -0800
Subject: [PATCH 11/11] update changelog

Signed-off-by: Zhiyu Cheng
---
 CHANGELOG.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 9d7500e58..bbbe6ab9e 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -21,6 +21,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 - Add LTX-2 and Wan2.2 (T2V) support in the diffusers quantization workflow.
 - Add PTQ support for GLM-4.7, including loading MTP layer weights from a separate ``mtp.safetensors`` file and export as-is.
 - Add support for image-text data calibration in PTQ for Nemotron VL models.
+- Add PTQ support for Nemotron-Parse.
 
 0.41 (2026-01-19)
 ^^^^^^^^^^^^^^^^^