From dc84b6fc5325db1d2c68378ceaec7ba5e92b3034 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:35:04 -0800 Subject: [PATCH 01/11] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 79 +++++++++---- examples/llm_ptq/hf_ptq.py | 116 +++++++++++++----- examples/llm_ptq/vlm_utils.py | 131 +++++++++++++++------ modelopt/torch/export/model_utils.py | 12 +- modelopt/torch/export/unified_export_hf.py | 84 ++++++++----- 5 files changed, 302 insertions(+), 120 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 93687a8d0..cfd218abf 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -31,6 +31,7 @@ from safetensors.torch import load_file from transformers import ( AutoConfig, + AutoModel, AutoModelForCausalLM, AutoProcessor, AutoTokenizer, @@ -67,27 +68,39 @@ def run_nemotron_vl_preview( """ from vlm_utils import run_text_only_generation, run_vl_preview_generation - print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...") - question = tokenizer.decode(input_ids[0], skip_special_tokens=True) - generation_config = { - "max_new_tokens": 100, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } - - # Try text-only generation - text_response = run_text_only_generation( - full_model, tokenizer, question, generation_config, pyt_ckpt_path - ) + # Check if this is Nemotron-Parse (encoder-decoder model that requires images) + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + generated_ids = None + + if not is_nemotron_parse: + # Only try text-only generation for models that support it (not Nemotron-Parse) + print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...") + question = tokenizer.decode(input_ids[0], skip_special_tokens=True) + 
generation_config = { + "max_new_tokens": 100, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } + + # Try text-only generation + text_response = run_text_only_generation( + full_model, tokenizer, question, generation_config, pyt_ckpt_path + ) - if text_response is not None: - print(f"✅ Text-only generation successful: {text_response[:100]}...") - generated_ids = text_response - elif allow_fallback: - print("Text-only generation failed, falling back to standard generate...") - generated_ids = full_model.generate(input_ids, max_new_tokens=100) + if text_response is not None: + print(f"✅ Text-only generation successful: {text_response[:100]}...") + generated_ids = text_response + elif allow_fallback: + print("Text-only generation failed, falling back to standard generate...") + generated_ids = full_model.generate(input_ids, max_new_tokens=100) else: - generated_ids = None + print( + f"Skipping text-only generation for Nemotron-Parse ({stage_name}) - " + "this encoder-decoder model requires images for all operations." 
+ ) # Run additional VL test with images print(f"Running additional VL test with images ({stage_name})...") @@ -98,6 +111,10 @@ def run_nemotron_vl_preview( def _is_multimodal_config(config): """Check if a config indicates a multimodal model (config-only version of is_multimodal_model).""" + # Check for Nemotron-Parse encoder-decoder architecture + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + return ( hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL) or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal @@ -106,6 +123,7 @@ def _is_multimodal_config(config): or ( hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer") ) # Image embedding layers + or is_nemotron_parse # Nemotron-Parse conditional generation model ) @@ -312,8 +330,19 @@ def get_processor( ) return MllamaImageProcessor(processor, device) - - return None + else: + # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) + # This will only work if the model has a processor config + try: + processor = AutoProcessor.from_pretrained( + ckpt_path, + **model_kwargs, + ) + print(f"Loaded AutoProcessor for model type: {model_type}") + return processor + except Exception as e: + print(f"Could not load processor for {model_type}: {e}") + return None def load_mtp_weights( @@ -466,8 +495,6 @@ def get_model( model_kwargs.setdefault("torch_dtype", "auto") if "vila" in ckpt_path.lower(): - from transformers import AutoModel - hf_vila = AutoModel.from_pretrained( ckpt_path, device_map=device_map, @@ -510,13 +537,13 @@ def get_model( if not hasattr(transformers, architecture): warnings.warn( f"Architecture {architecture} not found in transformers: {transformers.__version__}. " - "Falling back to AutoModelForCausalLM." + "Falling back to AutoModel." 
) assert trust_remote_code, ( "Please set trust_remote_code to True if you want to use this architecture" ) - auto_model_module = AutoModelForCausalLM + auto_model_module = AutoModel from_config = auto_model_module.from_config else: auto_model_module = getattr(transformers, architecture) @@ -527,7 +554,7 @@ def get_model( # unless specified by the hf_config. torch_dtype = getattr(hf_config, "torch_dtype", torch.bfloat16) model_kwargs2 = model_kwargs.copy() - if auto_model_module != AutoModelForCausalLM: + if auto_model_module not in [AutoModelForCausalLM, AutoModel]: model_kwargs2.pop("trust_remote_code", None) model_kwargs2["torch_dtype"] = torch_dtype model_kwargs2.pop("max_memory", None) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index d9a6ca893..b4be53ecf 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -411,6 +411,18 @@ def load_model(args: argparse.Namespace): args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[ : len(args.dataset) ] + + # Check if this is a Nemotron VL model that needs a processor + is_nemotron_vl_model = is_nemotron_vl(full_model) + if is_nemotron_vl_model: + # Load processor for Nemotron VL models (like Nemotron-Parse) + processor = get_processor( + args.pyt_ckpt_path, + model_type, + device, + trust_remote_code=args.trust_remote_code, + ) + tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code) default_padding_side = tokenizer.padding_side @@ -670,10 +682,20 @@ def pre_quantize( post-quantize generation. 
""" + # Check if this is Nemotron-Parse (encoder-decoder model) + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + # Only run single sample for preview - preview_input_ids = next(iter(calib_dataloader))[ - "input_features" if model_type == "whisper" else "input_ids" - ][0:1] + # For Nemotron-Parse, use decoder_input_ids instead of input_ids + sample_batch = next(iter(calib_dataloader)) + if is_nemotron_parse and "decoder_input_ids" in sample_batch: + preview_input_ids = sample_batch["decoder_input_ids"][0:1] + elif model_type == "whisper": + preview_input_ids = sample_batch["input_features"][0:1] + else: + preview_input_ids = sample_batch["input_ids"][0:1] # Generate preview before quantization if model_type == "deepseek": @@ -800,36 +822,46 @@ def quantize_main( device: torch.device, ): if args.batch_size == 0: - # Calibration/sparsification will actually take much more memory than regular inference - # due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio - # to 2 to avoid OOM for AWQ/SmoothQuant fake quantization as it will take more memory than inference. 
- sample_memory_usage_ratio = 2 if "awq" in args.qformat or "sq" in args.qformat else 1.1 - # Whisper model expects mel-spectrogram input features of length 3000 - # Whisper model needs input of shape (batch_size, num_mel_bins, 3000) - # As the encoder of Whisper doesn't have embedding layer, input dtype has to be float - # For non-Whisper models (language models), sample_input will be set up inside get_max_batch_size() - if model_type == "whisper": - max_sample_length = 3000 - num_mel_bins = language_model.config.num_mel_bins - sample_input_single_batch = ( - torch.ones([1, num_mel_bins, max_sample_length], dtype=language_model.dtype).to( - language_model.device - ) - * 100 + # Check if this is a vision-language model + # For VL models, skip automatic batch size detection and use a conservative default + # since proper multimodal input preparation is complex + if is_multimodal_model(full_model) or is_nemotron_vl(full_model): + print( + "Vision-language model detected. Using default batch_size=1 for calibration " + "to ensure proper handling of multimodal inputs." ) + args.batch_size = 1 else: - sample_input_single_batch = None + # Calibration/sparsification will actually take much more memory than regular inference + # due to intermediate tensors for fake quantization. Setting sample_memory_usage_ratio + # to 2 to avoid OOM for AWQ/SmoothQuant fake quantization as it will take more memory than inference. 
+ sample_memory_usage_ratio = 2 if "awq" in args.qformat or "sq" in args.qformat else 1.1 + # Whisper model expects mel-spectrogram input features of length 3000 + # Whisper model needs input of shape (batch_size, num_mel_bins, 3000) + # As the encoder of Whisper doesn't have embedding layer, input dtype has to be float + # For non-Whisper models (language models), sample_input will be set up inside get_max_batch_size() + if model_type == "whisper": + max_sample_length = 3000 + num_mel_bins = language_model.config.num_mel_bins + sample_input_single_batch = ( + torch.ones([1, num_mel_bins, max_sample_length], dtype=language_model.dtype).to( + language_model.device + ) + * 100 + ) + else: + sample_input_single_batch = None - run_auto_quant = args.auto_quantize_bits is not None + run_auto_quant = args.auto_quantize_bits is not None - args.batch_size = get_max_batch_size( - language_model, - max_sample_length=args.calib_seq, - sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0, - sample_input_single_batch=sample_input_single_batch, - enable_grad=run_auto_quant, - ) - args.batch_size = min(args.batch_size, sum(args.calib_size)) + args.batch_size = get_max_batch_size( + language_model, + max_sample_length=args.calib_seq, + sample_memory_usage_ratio=sample_memory_usage_ratio if not run_auto_quant else 1.0, + sample_input_single_batch=sample_input_single_batch, + enable_grad=run_auto_quant, + ) + args.batch_size = min(args.batch_size, sum(args.calib_size)) print(f"Use calib batch_size {args.batch_size}") @@ -840,6 +872,32 @@ def quantize_main( # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) + # For Nemotron-Parse, wrap the text-only dataloader to add dummy images + # Nemotron-Parse is an encoder-decoder model that requires pixel_values + if is_nemotron_vl_model and processor is not None: + config = full_model.config + architectures = getattr(config, "architectures", 
[]) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + if is_nemotron_parse: + # Check if we're quantizing just the decoder or the full model + decoder_only = language_model is not full_model + + if decoder_only: + print( + "Calibration will use text-only inputs for Nemotron-Parse decoder. " + "Vision encoder is excluded from quantization." + ) + else: + print( + "Wrapping calibration dataloader for Nemotron-Parse to add dummy images. " + "Nemotron-Parse requires pixel_values for full model calibration." + ) + + calib_dataloader = create_nemotron_parse_calib_wrapper( + calib_dataloader, processor, device, decoder_only=decoder_only + ) + preview_input_ids, generated_ids_before_ptq = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 6c9d921b8..4789130cd 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -18,7 +18,7 @@ import os from PIL import Image -from transformers import AutoImageProcessor, AutoProcessor +from transformers import AutoImageProcessor, AutoProcessor, GenerationConfig def run_vl_preview_generation(model, tokenizer, model_path, stage_name): @@ -73,13 +73,34 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): print(" Skipping VL preview generation.") return None + # Check if this is Nemotron-Parse early to set up proper generation config + config = model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + # Generate response question = "Describe this image briefly." 
# Updated for single image - generation_config = { - "max_new_tokens": 50, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } + + # Use model's GenerationConfig for Nemotron-Parse, dict for others + if is_nemotron_parse: + try: + generation_config = GenerationConfig.from_pretrained( + model_path, trust_remote_code=True + ) + print("Using Nemotron-Parse GenerationConfig from model") + except Exception as e: + print(f"Warning: Could not load GenerationConfig: {e}, using defaults") + generation_config = { + "max_new_tokens": 50, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } + else: + generation_config = { + "max_new_tokens": 50, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } print(f"Generating VL response ({stage_name})...") @@ -105,27 +126,39 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): else: processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - messages = [ - {"role": "system", "content": "/no_think"}, - { - "role": "user", - "content": [ - { - "type": "image", - "image": "", - }, - { - "type": "text", - "text": question, - }, - ], - }, - ] + # Check if this is Nemotron-Parse (uses task prompts instead of chat templates) + config = model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - # Apply chat template - prompt = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) + if is_nemotron_parse: + # Nemotron-Parse uses a specific task prompt format + # See: https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1#usage-example + prompt = "" + print(f"Using Nemotron-Parse task prompt: {prompt}") + else: + # Other VL models use chat templates + messages = [ + {"role": "system", "content": "/no_think"}, + { + "role": "user", + "content": [ + { + "type": "image", + "image": "", + }, + { + "type": "text", + "text": 
question, + }, + ], + }, + ] + + # Apply chat template + prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) # Process inputs using the processor with single image inputs = processor( @@ -139,21 +172,49 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): inputs = inputs.to(model_device) print(f" Moved inputs to {model_device}") + # Verify we have pixel_values for the vision encoder + if not hasattr(inputs, 'pixel_values') or inputs.pixel_values is None: + raise ValueError("Processor did not generate pixel_values. Check processor configuration.") + # Generate response using model.generate - generated_ids = model.generate( - pixel_values=inputs.pixel_values, - input_ids=inputs.input_ids, - attention_mask=inputs.attention_mask, - **generation_config, - ) + if isinstance(generation_config, GenerationConfig): + # For Nemotron-Parse with GenerationConfig object + generated_ids = model.generate( + pixel_values=inputs.pixel_values, + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + generation_config=generation_config, + ) + else: + # For other models with dict generation config + generated_ids = model.generate( + pixel_values=inputs.pixel_values, + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + **generation_config, + ) # Decode the response (trim input tokens like in the working example) + if generated_ids is None: + raise ValueError("Model generate returned None") + generated_ids_trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] - output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False - ) + + # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode + if is_nemotron_parse and hasattr(tokenizer, 'batch_decode'): + output_text = tokenizer.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, 
clean_up_tokenization_spaces=False + ) + else: + output_text = processor.batch_decode( + generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + ) + + if output_text is None or len(output_text) == 0: + raise ValueError("Decoding returned empty output") + response = output_text[0] print(f"✅ VL generation {stage_name} successful!") diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 5a24429ad..40c313ad2 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -85,6 +85,7 @@ def is_multimodal_model(model): - Vision LoRA configurations - Audio processing capabilities - Image embedding layers + - Nemotron-Parse conditional generation models Args: model: The HuggingFace model instance to check @@ -103,6 +104,10 @@ def is_multimodal_model(model): """ config = model.config + # Check for Nemotron-Parse encoder-decoder architecture + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + return ( hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL) or hasattr(model, "language_model") # Language model attribute (e.g., LLaVA) @@ -112,6 +117,7 @@ def is_multimodal_model(model): or ( hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer") ) # Image embedding layers + or is_nemotron_parse # Nemotron-Parse conditional generation model ) @@ -141,5 +147,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None: if hasattr(model, "language_model"): return [model, model.language_model] - # Pattern 3: No language_model found + # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model + if hasattr(model, "decoder"): + return [model, model.decoder] + + # Pattern 4: No language_model found return None diff --git a/modelopt/torch/export/unified_export_hf.py 
b/modelopt/torch/export/unified_export_hf.py index 5703f4515..f906c6797 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -148,13 +148,13 @@ def _collect_shared_input_modules( def _input_hook(module, input, output): """Update dictionary with list of all modules that share the same input.""" if len(input) > 0 and isinstance(input[0], torch.Tensor): - # TODO: Handle DBRX MoE case - input_to_linear[input[0]].append(module) + # TODO: Handle DBRX MoE case + input_to_linear[input[0]].append(module) def _output_hook(module, input, output): """Update dictionary with mapping of layernorms and their outputs.""" if output_to_layernorm is not None and isinstance(output, torch.Tensor): - output_to_layernorm[output] = module + output_to_layernorm[output] = module handles = [] @@ -316,29 +316,36 @@ def llm_dummy_forward(): [1, model.config.num_mel_bins, feature_extractor.nb_max_frames], dtype=model.dtype ).to(model.device) - if getattr(model.config, "is_encoder_decoder", False): - # For encoder-decoder models, we need to pass both the encoder and decoder input ids + # Check if this is Nemotron-Parse (encoder-decoder VL model) + architectures = getattr(model.config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + + if is_vl_model and ("nemotron" in model_type or is_nemotron_parse): + # For Nemotron VL models (including Nemotron-Parse), run optimization on just the + # language model/decoder. This avoids needing pixel_values for the vision encoder. 
+ language_model_lineage = get_language_model_from_vl(model) + + if language_model_lineage is not None: + language_model = language_model_lineage[-1] + print( + f"Running optimization on language model with fake_input shape: {fake_input.shape}" + ) + # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors + if is_nemotron_parse: + language_model(fake_input, use_cache=False) + else: + language_model(fake_input) + else: + raise ValueError( + f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " + "This is required for requantization/resmoothing optimization. " + "Please ensure the model architecture is supported or file an issue." + ) + elif getattr(model.config, "is_encoder_decoder", False): + # For other encoder-decoder models (non-VL), pass both encoder and decoder input ids model(fake_input, decoder_input_ids=decoder_fake_input) - elif is_vl_model and "nemotron" in model_type: - # For Nemotron VL models, try to run optimization on just the language model part - language_model_lineage = get_language_model_from_vl(model) - - if language_model_lineage is not None: - # Run optimization on just the language model with the same input format as regular LLMs - # Use the same fake_input tensor that regular LLMs use - language_model = language_model_lineage[-1] - print( - f"Running optimization on language model with fake_input shape: {fake_input.shape}" - ) - language_model(fake_input) else: - raise ValueError( - f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " - "This is required for requantization/resmoothing optimization. " - "Please ensure the model architecture is supported or file an issue." 
- ) - else: - model(fake_input) + model(fake_input) input_to_linear, output_to_layernorm = _collect_shared_input_modules( model, llm_dummy_forward, collect_layernorms=True @@ -405,9 +412,19 @@ def _export_quantized_weight( if quantization_format == QUANTIZATION_FP8: # Convert amax to float32 + # Note: Use the public 'amax' property, not the private '_amax' attribute + if hasattr(weight_quantizer, '_amax') and weight_quantizer._amax is not None: weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) - - if weight_quantizer._amax.dim() == 1: + amax_tensor = weight_quantizer._amax + else: + # Fallback to public amax property + amax_tensor = weight_quantizer.amax + if amax_tensor is not None and hasattr(amax_tensor, 'to'): + amax_tensor = amax_tensor.to(torch.float32) + + # Only compute scaling factor if amax_tensor is valid + if amax_tensor is not None and hasattr(amax_tensor, 'dim'): + if amax_tensor.dim() == 1: # Per-tensor amax weight_scaling_factor = torch.tensor( weight_quantizer.amax.item() / weight_quantizer.maxbound @@ -421,8 +438,9 @@ def _export_quantized_weight( weight_scaling_factor, ) - if hasattr(input_quantizer, "_amax"): + if hasattr(input_quantizer, "_amax") or (hasattr(input_quantizer, "amax") and input_quantizer.amax is not None): assert input_quantizer is not None + if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: input_quantizer._amax = input_quantizer._amax.to(torch.float32) sub_module.register_buffer( @@ -432,8 +450,9 @@ def _export_quantized_weight( ).squeeze(), ) - if hasattr(output_quantizer, "_amax"): + if hasattr(output_quantizer, "_amax") or (hasattr(output_quantizer, "amax") and output_quantizer.amax is not None): assert output_quantizer is not None + if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: output_quantizer._amax = output_quantizer._amax.to(torch.float32) else: # Register weight_scale and input_scale @@ -451,7 +470,7 @@ def _export_quantized_weight( ) 
sub_module.register_buffer(quantizer_attrs.weight_scale, e8m0_scale) if hasattr(weight_quantizer, "_scale") and weight_quantizer._scale is not None: - del weight_quantizer._scale + del weight_quantizer._scale else: sub_module.register_buffer( quantizer_attrs.weight_scale, get_weight_scaling_factor(sub_module, weight_name) @@ -485,6 +504,13 @@ def _export_quantized_weight( weight_scale: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale, None) weight_scale_2: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale_2, None) + # If weight_scale is None (e.g., quantizer wasn't calibrated), skip quantization for this module + # This can happen for modules that were disabled from quantization or have invalid calibration data + if weight_scale is None and quantization_format not in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]: + # For NVFP4, weight_scale is computed later, so we can't check here + print(f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found") + return + # Transpose weight for bmm-style expert quantization (llama4, gpt-oss) # Check if this is a BMM-style expert weight that needs transposition is_bmm_expert_weight = weight.dim() == 3 and any( From 8ee2778d88b7ca4517268286dbb9df7a26d9b862 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:36:03 -0800 Subject: [PATCH 02/11] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/vlm_utils.py | 16 +++++--- modelopt/torch/export/unified_export_hf.py | 44 +++++++++++++++------- 2 files changed, 41 insertions(+), 19 deletions(-) diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 4789130cd..2d3d9f82c 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -173,8 +173,10 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): print(f" Moved inputs to {model_device}") # Verify we have pixel_values for the vision encoder - if not 
hasattr(inputs, 'pixel_values') or inputs.pixel_values is None: - raise ValueError("Processor did not generate pixel_values. Check processor configuration.") + if not hasattr(inputs, "pixel_values") or inputs.pixel_values is None: + raise ValueError( + "Processor did not generate pixel_values. Check processor configuration." + ) # Generate response using model.generate if isinstance(generation_config, GenerationConfig): @@ -203,13 +205,17 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): ] # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode - if is_nemotron_parse and hasattr(tokenizer, 'batch_decode'): + if is_nemotron_parse and hasattr(tokenizer, "batch_decode"): output_text = tokenizer.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, ) else: output_text = processor.batch_decode( - generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, ) if output_text is None or len(output_text) == 0: diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index f906c6797..6efcb1e6a 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -413,32 +413,41 @@ def _export_quantized_weight( if quantization_format == QUANTIZATION_FP8: # Convert amax to float32 # Note: Use the public 'amax' property, not the private '_amax' attribute - if hasattr(weight_quantizer, '_amax') and weight_quantizer._amax is not None: - weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) + if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: + weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) amax_tensor = weight_quantizer._amax else: # Fallback 
to public amax property amax_tensor = weight_quantizer.amax - if amax_tensor is not None and hasattr(amax_tensor, 'to'): + if amax_tensor is not None and hasattr(amax_tensor, "to"): amax_tensor = amax_tensor.to(torch.float32) # Only compute scaling factor if amax_tensor is valid - if amax_tensor is not None and hasattr(amax_tensor, 'dim'): + if amax_tensor is not None and hasattr(amax_tensor, "dim"): if amax_tensor.dim() == 1: - # Per-tensor amax - weight_scaling_factor = torch.tensor( - weight_quantizer.amax.item() / weight_quantizer.maxbound + # Per-tensor amax + weight_scaling_factor = torch.tensor( + weight_quantizer.amax.item() / weight_quantizer.maxbound + ) + else: + # Per-channel amax + weight_scaling_factor = torch.tensor( + weight_quantizer.amax / weight_quantizer.maxbound + ) + + sub_module.register_buffer( + quantizer_attrs.weight_scale, + weight_scaling_factor, ) - else: - # Per-channel amax - weight_scaling_factor = torch.tensor(weight_quantizer.amax / weight_quantizer.maxbound) sub_module.register_buffer( quantizer_attrs.weight_scale, weight_scaling_factor, ) - if hasattr(input_quantizer, "_amax") or (hasattr(input_quantizer, "amax") and input_quantizer.amax is not None): + if hasattr(input_quantizer, "_amax") or ( + hasattr(input_quantizer, "amax") and input_quantizer.amax is not None + ): assert input_quantizer is not None if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: input_quantizer._amax = input_quantizer._amax.to(torch.float32) @@ -450,7 +459,9 @@ def _export_quantized_weight( ).squeeze(), ) - if hasattr(output_quantizer, "_amax") or (hasattr(output_quantizer, "amax") and output_quantizer.amax is not None): + if hasattr(output_quantizer, "_amax") or ( + hasattr(output_quantizer, "amax") and output_quantizer.amax is not None + ): assert output_quantizer is not None if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: output_quantizer._amax = output_quantizer._amax.to(torch.float32) @@ -506,9 
+517,14 @@ def _export_quantized_weight( # If weight_scale is None (e.g., quantizer wasn't calibrated), skip quantization for this module # This can happen for modules that were disabled from quantization or have invalid calibration data - if weight_scale is None and quantization_format not in [QUANTIZATION_NVFP4, QUANTIZATION_NVFP4_AWQ]: + if weight_scale is None and quantization_format not in [ + QUANTIZATION_NVFP4, + QUANTIZATION_NVFP4_AWQ, + ]: # For NVFP4, weight_scale is computed later, so we can't check here - print(f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found") + print( + f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found" + ) return # Transpose weight for bmm-style expert quantization (llama4, gpt-oss) From 2f2203cbe24ad9497df7976d97699553c3ddfa60 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 14 Jan 2026 14:38:52 -0800 Subject: [PATCH 03/11] add support for nemotron parse fp8 Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/unified_export_hf.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 6efcb1e6a..eb33ee8fe 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -446,7 +446,9 @@ def _export_quantized_weight( ) if hasattr(input_quantizer, "_amax") or ( - hasattr(input_quantizer, "amax") and input_quantizer.amax is not None + input_quantizer is not None + and hasattr(input_quantizer, "amax") + and input_quantizer.amax is not None ): assert input_quantizer is not None if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: @@ -460,7 +462,9 @@ def _export_quantized_weight( ) if hasattr(output_quantizer, "_amax") or ( - hasattr(output_quantizer, "amax") and output_quantizer.amax is not None + output_quantizer is not None + and hasattr(output_quantizer, "amax") + and 
output_quantizer.amax is not None ): assert output_quantizer is not None if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: From 7ea5e0e870242696042b50d39088d7006e0b7586 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 20 Jan 2026 17:42:11 -0800 Subject: [PATCH 04/11] add image-text data calibration support Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 90 +++++++++++++++++++--- examples/llm_ptq/hf_ptq.py | 81 ++++++++++--------- modelopt/torch/export/unified_export_hf.py | 10 +-- 3 files changed, 128 insertions(+), 53 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index cfd218abf..3fb6f1ceb 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -276,9 +276,33 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok if "vila" in ckpt_path.lower(): ckpt_path += "/llm" - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, trust_remote_code=trust_remote_code, **kwargs - ) + # Suppress verbose tokenizer output (e.g., printing all special tokens) + import contextlib + import io + import logging + import os + + # Save current settings + old_verbosity = os.environ.get("TOKENIZERS_PARALLELISM", None) + transformers_log_level = logging.getLogger("transformers").level + + # Suppress output + os.environ["TOKENIZERS_PARALLELISM"] = "false" + logging.getLogger("transformers").setLevel(logging.ERROR) + + # Also capture stdout to suppress verbose tokenizer printing + with contextlib.redirect_stdout(io.StringIO()): + try: + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, trust_remote_code=trust_remote_code, **kwargs + ) + finally: + # Restore original settings + if old_verbosity is not None: + os.environ["TOKENIZERS_PARALLELISM"] = old_verbosity + else: + os.environ.pop("TOKENIZERS_PARALLELISM", None) + logging.getLogger("transformers").setLevel(transformers_log_level) # can't set attribute 'pad_token' for "" 
# We skip this step for Nemo models @@ -334,10 +358,23 @@ def get_processor( # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) # This will only work if the model has a processor config try: - processor = AutoProcessor.from_pretrained( - ckpt_path, - **model_kwargs, - ) + import contextlib + import io + import logging + + # Suppress verbose output from processor/tokenizer loading + transformers_log_level = logging.getLogger("transformers").level + logging.getLogger("transformers").setLevel(logging.ERROR) + + with contextlib.redirect_stdout(io.StringIO()): + processor = AutoProcessor.from_pretrained( + ckpt_path, + **model_kwargs, + ) + + # Restore logging + logging.getLogger("transformers").setLevel(transformers_log_level) + print(f"Loaded AutoProcessor for model type: {model_type}") return processor except Exception as e: @@ -476,12 +513,26 @@ def get_model( # Load config once and handle VL model detection try: hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs) + + # Check specifically for Nemotron-Parse + architectures = getattr(hf_config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + if is_nemotron_vl(hf_config): - print( - "Detected Nemotron VL model from config. " - "Disabling automatic device mapping for compatibility." - ) - device_map = None + if is_nemotron_parse: + # Nemotron-Parse works fine with device_map="auto" + # Keep device_map="auto" to ensure proper device placement + print( + "Detected Nemotron-Parse model from config. " + "Using automatic device mapping." + ) + else: + # For other Nemotron VL models, disable device_map for compatibility + print( + "Detected Nemotron VL model from config. " + "Disabling automatic device mapping for compatibility." 
+ ) + device_map = None except Exception as e: print(f"Error: Could not load config from {ckpt_path}: {e}") raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e @@ -590,6 +641,21 @@ def get_model( print(f"Moving model to {device} device...") model = model.to(device) + # For Nemotron-Parse, ensure the encoder (including RADIO) is fully on device + # The RADIO encoder has buffers that might not be properly moved even with device_map="auto" + # This is because custom RADIO modules might not fully support accelerate's device_map + if device != "cpu" and hasattr(model, "encoder"): + # Check if encoder has any buffers on CPU + cpu_buffers = [] + for name, buffer in model.encoder.named_buffers(): + if buffer.device.type == "cpu": + cpu_buffers.append(name) + + if cpu_buffers: + print(f"Found {len(cpu_buffers)} encoder buffers on CPU. Moving encoder to {device}...") + model.encoder = model.encoder.to(device) + print(f"Encoder moved to {device}") + if device == "cuda" and not is_model_on_gpu(model): print("Warning: Some parameters are not on a GPU. 
Calibration can be slow or hit OOM") diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index b4be53ecf..ad552ee43 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -66,6 +66,7 @@ ) from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor from modelopt.torch.utils.memory_monitor import launch_memory_monitor +from modelopt.torch.utils.nemotron_vlm_dataset_utils import get_nemotron_vlm_dataset_dataloader from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader @@ -141,6 +142,7 @@ def make_calib_dataloader( tokenizer: PreTrainedTokenizerBase | None, device: torch.device, model_type: str | None, + full_model: torch.nn.Module | None = None, ) -> tuple[DataLoader, str | None]: calib_dataloader = None first_text_speech_dataset = None @@ -402,18 +404,35 @@ def load_model(args: argparse.Namespace): language_model = extracted_lm model_type = extracted_model_type else: + # Check if this is a Nemotron VL model that needs a processor + # Do this BEFORE setting default datasets so we can use image-text data for Nemotron-Parse + is_nemotron_vl_model = is_nemotron_vl(full_model) + + # Check specifically for Nemotron-Parse to set appropriate dataset defaults + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + if args.dataset is None: - args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] - warnings.warn( - "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." - ) + if is_nemotron_parse: + # For Nemotron-Parse, default to Nemotron VLM Dataset v2 + args.dataset = ["nemotron_vlm_v2"] + print( + "No dataset specified. Defaulting to 'nemotron_vlm_v2' for Nemotron-Parse " + "(NVIDIA's image-text dataset for better calibration)." 
+ ) + else: + # For other models, use text-only datasets + args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] + warnings.warn( + "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." + ) + # Adjust calib_size to match dataset length by extending or truncating as needed args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[ : len(args.dataset) ] - # Check if this is a Nemotron VL model that needs a processor - is_nemotron_vl_model = is_nemotron_vl(full_model) if is_nemotron_vl_model: # Load processor for Nemotron VL models (like Nemotron-Parse) processor = get_processor( @@ -506,14 +525,23 @@ def mono_quantize( "Consider reducing calib_size to reduce calibration time.\n####\n" ) + # Check if this is Nemotron-Parse + config = full_model.config + architectures = getattr(config, "architectures", []) + is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + original_forward = None # Track original forward method if we wrap it + # For Nemotron VL models, disable quantization of vision components if is_nemotron_vl_model: print("Disabling quantization for vision components in Nemotron VL model") quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} quant_cfg["quant_cfg"]["*image*"] = {"enable": False} - # Also disable radio model components specifically + # Also disable radio model components specifically (for Nemotron-Parse) quant_cfg["quant_cfg"]["*radio*"] = {"enable": False} quant_cfg["quant_cfg"]["*visual*"] = {"enable": False} + quant_cfg["quant_cfg"]["*encoder*"] = {"enable": False} # Disable encoder + quant_cfg["quant_cfg"]["*model_encoder*"] = {"enable": False} # Nemotron-Parse specific + print("Quantization will only be applied to the decoder (text generation) component") if not model_is_already_quantized or calibration_only: if model_type == "gptoss" and args.qformat == "nvfp4_mlp_only": @@ -541,8 +569,15 @@ def mono_quantize( else: language_model = 
mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop) - # For VL models, update full_model to use the quantized language model - if is_nemotron_vl_model: + # Restore original forward method if we wrapped it for Nemotron-Parse + if is_nemotron_parse and original_forward is not None: + print("Restoring original forward method after calibration") + language_model.forward = original_forward + original_forward = None + + # For VL models (except Nemotron-Parse), update full_model to use the quantized language model + # For Nemotron-Parse, language_model IS full_model, so no update needed + if is_nemotron_vl_model and language_model is not full_model: language_model_lineage = get_language_model_from_vl(full_model) if language_model_lineage is not None: print("Updating full_model with quantized language_model...") @@ -866,38 +901,12 @@ def quantize_main( print(f"Use calib batch_size {args.batch_size}") calib_dataloader, first_text_speech_dataset = make_calib_dataloader( - args, language_model, processor, tokenizer, device, model_type + args, language_model, processor, tokenizer, device, model_type, full_model ) # Detect if this is a Nemotron VL model using architecture-based detection is_nemotron_vl_model = is_nemotron_vl(full_model) - # For Nemotron-Parse, wrap the text-only dataloader to add dummy images - # Nemotron-Parse is an encoder-decoder model that requires pixel_values - if is_nemotron_vl_model and processor is not None: - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - - if is_nemotron_parse: - # Check if we're quantizing just the decoder or the full model - decoder_only = language_model is not full_model - - if decoder_only: - print( - "Calibration will use text-only inputs for Nemotron-Parse decoder. " - "Vision encoder is excluded from quantization." 
- ) - else: - print( - "Wrapping calibration dataloader for Nemotron-Parse to add dummy images. " - "Nemotron-Parse requires pixel_values for full model calibration." - ) - - calib_dataloader = create_nemotron_parse_calib_wrapper( - calib_dataloader, processor, device, decoder_only=decoder_only - ) - preview_input_ids, generated_ids_before_ptq = pre_quantize( args, full_model, model_type, tokenizer, calib_dataloader, is_nemotron_vl_model ) diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index eb33ee8fe..424c5f8e9 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -330,11 +330,11 @@ def llm_dummy_forward(): print( f"Running optimization on language model with fake_input shape: {fake_input.shape}" ) - # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors - if is_nemotron_parse: - language_model(fake_input, use_cache=False) - else: - language_model(fake_input) + # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors + if is_nemotron_parse: + language_model(fake_input, use_cache=False) + else: + language_model(fake_input) else: raise ValueError( f"Cannot extract language_model from Nemotron VL model (type: {model_type}). 
" From d1963559a05813d3dafec92f0269b82a4011b33d Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 20 Jan 2026 17:42:46 -0800 Subject: [PATCH 05/11] add image-text data calibration support Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 3fb6f1ceb..23ff97f56 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -522,10 +522,7 @@ def get_model( if is_nemotron_parse: # Nemotron-Parse works fine with device_map="auto" # Keep device_map="auto" to ensure proper device placement - print( - "Detected Nemotron-Parse model from config. " - "Using automatic device mapping." - ) + print("Detected Nemotron-Parse model from config. Using automatic device mapping.") else: # For other Nemotron VL models, disable device_map for compatibility print( From dc1af904cd87e98d90efbfb1e69abaf2bd5d8109 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Mon, 9 Feb 2026 22:28:42 -0800 Subject: [PATCH 06/11] fix issues caused by rebase and simplify Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/hf_ptq.py | 37 +++------------ modelopt/torch/export/unified_export_hf.py | 53 ++++++++++------------ 2 files changed, 30 insertions(+), 60 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index ad552ee43..0912c5d83 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -66,7 +66,6 @@ ) from modelopt.torch.utils.image_processor import BaseImageProcessor, MllamaImageProcessor from modelopt.torch.utils.memory_monitor import launch_memory_monitor -from modelopt.torch.utils.nemotron_vlm_dataset_utils import get_nemotron_vlm_dataset_dataloader from modelopt.torch.utils.speech_dataset_utils import get_speech_dataset_dataloader from modelopt.torch.utils.vlm_dataset_utils import get_vlm_dataset_dataloader @@ -142,7 +141,6 @@ def make_calib_dataloader( 
tokenizer: PreTrainedTokenizerBase | None, device: torch.device, model_type: str | None, - full_model: torch.nn.Module | None = None, ) -> tuple[DataLoader, str | None]: calib_dataloader = None first_text_speech_dataset = None @@ -525,12 +523,6 @@ def mono_quantize( "Consider reducing calib_size to reduce calibration time.\n####\n" ) - # Check if this is Nemotron-Parse - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - original_forward = None # Track original forward method if we wrap it - # For Nemotron VL models, disable quantization of vision components if is_nemotron_vl_model: print("Disabling quantization for vision components in Nemotron VL model") @@ -569,15 +561,8 @@ def mono_quantize( else: language_model = mtq.quantize(language_model, quant_cfg, forward_loop=calibrate_loop) - # Restore original forward method if we wrapped it for Nemotron-Parse - if is_nemotron_parse and original_forward is not None: - print("Restoring original forward method after calibration") - language_model.forward = original_forward - original_forward = None - - # For VL models (except Nemotron-Parse), update full_model to use the quantized language model - # For Nemotron-Parse, language_model IS full_model, so no update needed - if is_nemotron_vl_model and language_model is not full_model: + # For VL models, update full_model to use the quantized language model + if is_nemotron_vl_model: language_model_lineage = get_language_model_from_vl(full_model) if language_model_lineage is not None: print("Updating full_model with quantized language_model...") @@ -717,20 +702,10 @@ def pre_quantize( post-quantize generation. 
""" - # Check if this is Nemotron-Parse (encoder-decoder model) - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - # Only run single sample for preview - # For Nemotron-Parse, use decoder_input_ids instead of input_ids - sample_batch = next(iter(calib_dataloader)) - if is_nemotron_parse and "decoder_input_ids" in sample_batch: - preview_input_ids = sample_batch["decoder_input_ids"][0:1] - elif model_type == "whisper": - preview_input_ids = sample_batch["input_features"][0:1] - else: - preview_input_ids = sample_batch["input_ids"][0:1] + preview_input_ids = next(iter(calib_dataloader))[ + "input_features" if model_type == "whisper" else "input_ids" + ][0:1] # Generate preview before quantization if model_type == "deepseek": @@ -901,7 +876,7 @@ def quantize_main( print(f"Use calib batch_size {args.batch_size}") calib_dataloader, first_text_speech_dataset = make_calib_dataloader( - args, language_model, processor, tokenizer, device, model_type, full_model + args, language_model, processor, tokenizer, device, model_type ) # Detect if this is a Nemotron VL model using architecture-based detection diff --git a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 424c5f8e9..878970dd3 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -148,13 +148,13 @@ def _collect_shared_input_modules( def _input_hook(module, input, output): """Update dictionary with list of all modules that share the same input.""" if len(input) > 0 and isinstance(input[0], torch.Tensor): - # TODO: Handle DBRX MoE case - input_to_linear[input[0]].append(module) + # TODO: Handle DBRX MoE case + input_to_linear[input[0]].append(module) def _output_hook(module, input, output): """Update dictionary with mapping of layernorms and their outputs.""" if output_to_layernorm is not None and 
isinstance(output, torch.Tensor): - output_to_layernorm[output] = module + output_to_layernorm[output] = module handles = [] @@ -323,29 +323,29 @@ def llm_dummy_forward(): if is_vl_model and ("nemotron" in model_type or is_nemotron_parse): # For Nemotron VL models (including Nemotron-Parse), run optimization on just the # language model/decoder. This avoids needing pixel_values for the vision encoder. - language_model_lineage = get_language_model_from_vl(model) + language_model_lineage = get_language_model_from_vl(model) - if language_model_lineage is not None: - language_model = language_model_lineage[-1] - print( - f"Running optimization on language model with fake_input shape: {fake_input.shape}" - ) - # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors - if is_nemotron_parse: - language_model(fake_input, use_cache=False) - else: - language_model(fake_input) + if language_model_lineage is not None: + language_model = language_model_lineage[-1] + print( + f"Running optimization on language model with fake_input shape: {fake_input.shape}" + ) + # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors + if is_nemotron_parse: + language_model(fake_input, use_cache=False) else: - raise ValueError( - f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " - "This is required for requantization/resmoothing optimization. " - "Please ensure the model architecture is supported or file an issue." - ) + language_model(fake_input) + else: + raise ValueError( + f"Cannot extract language_model from Nemotron VL model (type: {model_type}). " + "This is required for requantization/resmoothing optimization. " + "Please ensure the model architecture is supported or file an issue." 
+ ) elif getattr(model.config, "is_encoder_decoder", False): # For other encoder-decoder models (non-VL), pass both encoder and decoder input ids model(fake_input, decoder_input_ids=decoder_fake_input) - else: - model(fake_input) + else: + model(fake_input) input_to_linear, output_to_layernorm = _collect_shared_input_modules( model, llm_dummy_forward, collect_layernorms=True @@ -440,11 +440,6 @@ def _export_quantized_weight( weight_scaling_factor, ) - sub_module.register_buffer( - quantizer_attrs.weight_scale, - weight_scaling_factor, - ) - if hasattr(input_quantizer, "_amax") or ( input_quantizer is not None and hasattr(input_quantizer, "amax") @@ -452,7 +447,7 @@ def _export_quantized_weight( ): assert input_quantizer is not None if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: - input_quantizer._amax = input_quantizer._amax.to(torch.float32) + input_quantizer._amax = input_quantizer._amax.to(torch.float32) sub_module.register_buffer( quantizer_attrs.input_scale, @@ -468,7 +463,7 @@ def _export_quantized_weight( ): assert output_quantizer is not None if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: - output_quantizer._amax = output_quantizer._amax.to(torch.float32) + output_quantizer._amax = output_quantizer._amax.to(torch.float32) else: # Register weight_scale and input_scale if quantization_format == QUANTIZATION_FP8_PB_REAL: @@ -485,7 +480,7 @@ def _export_quantized_weight( ) sub_module.register_buffer(quantizer_attrs.weight_scale, e8m0_scale) if hasattr(weight_quantizer, "_scale") and weight_quantizer._scale is not None: - del weight_quantizer._scale + del weight_quantizer._scale else: sub_module.register_buffer( quantizer_attrs.weight_scale, get_weight_scaling_factor(sub_module, weight_name) From e0e28cb3887f07d75a484722d29402ab2214c670 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 10 Feb 2026 00:02:05 -0800 Subject: [PATCH 07/11] clean up Signed-off-by: Zhiyu Cheng --- 
examples/llm_ptq/example_utils.py | 63 ++++++++++------------------ examples/llm_ptq/hf_ptq.py | 37 ++-------------- examples/llm_ptq/vlm_utils.py | 6 +-- modelopt/torch/export/model_utils.py | 4 +- 4 files changed, 29 insertions(+), 81 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index 23ff97f56..e4e7fe1b9 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -276,33 +276,20 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok if "vila" in ckpt_path.lower(): ckpt_path += "/llm" - # Suppress verbose tokenizer output (e.g., printing all special tokens) - import contextlib - import io - import logging - import os - - # Save current settings - old_verbosity = os.environ.get("TOKENIZERS_PARALLELISM", None) - transformers_log_level = logging.getLogger("transformers").level - - # Suppress output - os.environ["TOKENIZERS_PARALLELISM"] = "false" - logging.getLogger("transformers").setLevel(logging.ERROR) - - # Also capture stdout to suppress verbose tokenizer printing - with contextlib.redirect_stdout(io.StringIO()): - try: + # Some custom tokenizers (e.g., Nemotron-Parse) print verbose output when loading. + # Only suppress stdout for trust_remote_code models where custom tokenizer code may be noisy. 
+ if trust_remote_code: + import contextlib + import io + + with contextlib.redirect_stdout(io.StringIO()): tokenizer = AutoTokenizer.from_pretrained( ckpt_path, trust_remote_code=trust_remote_code, **kwargs ) - finally: - # Restore original settings - if old_verbosity is not None: - os.environ["TOKENIZERS_PARALLELISM"] = old_verbosity - else: - os.environ.pop("TOKENIZERS_PARALLELISM", None) - logging.getLogger("transformers").setLevel(transformers_log_level) + else: + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, trust_remote_code=trust_remote_code, **kwargs + ) # can't set attribute 'pad_token' for "" # We skip this step for Nemo models @@ -355,25 +342,17 @@ def get_processor( return MllamaImageProcessor(processor, device) else: - # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) - # This will only work if the model has a processor config - try: - import contextlib - import io - import logging - - # Suppress verbose output from processor/tokenizer loading - transformers_log_level = logging.getLogger("transformers").level - logging.getLogger("transformers").setLevel(logging.ERROR) - - with contextlib.redirect_stdout(io.StringIO()): - processor = AutoProcessor.from_pretrained( - ckpt_path, - **model_kwargs, - ) + # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse). + # Suppress stdout for trust_remote_code models where custom processor code may be noisy. 
+ import contextlib + import io - # Restore logging - logging.getLogger("transformers").setLevel(transformers_log_level) + try: + if model_kwargs.get("trust_remote_code", False): + with contextlib.redirect_stdout(io.StringIO()): + processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs) + else: + processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs) print(f"Loaded AutoProcessor for model type: {model_type}") return processor diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 0912c5d83..0a414e408 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -402,44 +402,15 @@ def load_model(args: argparse.Namespace): language_model = extracted_lm model_type = extracted_model_type else: - # Check if this is a Nemotron VL model that needs a processor - # Do this BEFORE setting default datasets so we can use image-text data for Nemotron-Parse - is_nemotron_vl_model = is_nemotron_vl(full_model) - - # Check specifically for Nemotron-Parse to set appropriate dataset defaults - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - if args.dataset is None: - if is_nemotron_parse: - # For Nemotron-Parse, default to Nemotron VLM Dataset v2 - args.dataset = ["nemotron_vlm_v2"] - print( - "No dataset specified. Defaulting to 'nemotron_vlm_v2' for Nemotron-Parse " - "(NVIDIA's image-text dataset for better calibration)." - ) - else: - # For other models, use text-only datasets - args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] - warnings.warn( - "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." - ) - + args.dataset = ["cnn_dailymail", "nemotron-post-training-dataset-v2"] + warnings.warn( + "No dataset specified. Defaulting to cnn_dailymail and nemotron-post-training-dataset-v2." 
+ ) # Adjust calib_size to match dataset length by extending or truncating as needed args.calib_size = (args.calib_size + [args.calib_size[-1]] * len(args.dataset))[ : len(args.dataset) ] - - if is_nemotron_vl_model: - # Load processor for Nemotron VL models (like Nemotron-Parse) - processor = get_processor( - args.pyt_ckpt_path, - model_type, - device, - trust_remote_code=args.trust_remote_code, - ) - tokenizer = get_tokenizer(args.pyt_ckpt_path, trust_remote_code=args.trust_remote_code) default_padding_side = tokenizer.padding_side diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 2d3d9f82c..529efeb15 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -126,11 +126,7 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): else: processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - # Check if this is Nemotron-Parse (uses task prompts instead of chat templates) - config = model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - + # is_nemotron_parse was already computed above if is_nemotron_parse: # Nemotron-Parse uses a specific task prompt format # See: https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1#usage-example diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 40c313ad2..5dac1b933 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -147,7 +147,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None: if hasattr(model, "language_model"): return [model, model.language_model] - # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model + # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model. 
+ # Note: This is safe because this function is only called when the model is already detected as a VLM. + # Non-VLM encoder-decoder models (T5, Bart) won't reach this code path. if hasattr(model, "decoder"): return [model, model.decoder] From 3dd8758653cd8edd9844196a555168ce9b4777a9 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Tue, 10 Feb 2026 11:30:04 -0800 Subject: [PATCH 08/11] make image-text calib default for VLMs, further simplify Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/example_utils.py | 144 +++++++-------------- examples/llm_ptq/hf_ptq.py | 8 +- examples/llm_ptq/vlm_utils.py | 92 ++++--------- modelopt/torch/export/unified_export_hf.py | 85 ++++-------- 4 files changed, 103 insertions(+), 226 deletions(-) diff --git a/examples/llm_ptq/example_utils.py b/examples/llm_ptq/example_utils.py index e4e7fe1b9..71755a02f 100755 --- a/examples/llm_ptq/example_utils.py +++ b/examples/llm_ptq/example_utils.py @@ -68,39 +68,26 @@ def run_nemotron_vl_preview( """ from vlm_utils import run_text_only_generation, run_vl_preview_generation - # Check if this is Nemotron-Parse (encoder-decoder model that requires images) - config = full_model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) + print(f"Running text-only preview generation for Nemotron VL model ({stage_name})...") + question = tokenizer.decode(input_ids[0], skip_special_tokens=True) + generation_config = { + "max_new_tokens": 100, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } + + # Try text-only generation (may fail for encoder-decoder models like Nemotron-Parse) + text_response = run_text_only_generation( + full_model, tokenizer, question, generation_config, pyt_ckpt_path + ) generated_ids = None - - if not is_nemotron_parse: - # Only try text-only generation for models that support it (not Nemotron-Parse) - print(f"Running text-only preview generation for Nemotron VL model 
({stage_name})...") - question = tokenizer.decode(input_ids[0], skip_special_tokens=True) - generation_config = { - "max_new_tokens": 100, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } - - # Try text-only generation - text_response = run_text_only_generation( - full_model, tokenizer, question, generation_config, pyt_ckpt_path - ) - - if text_response is not None: - print(f"✅ Text-only generation successful: {text_response[:100]}...") - generated_ids = text_response - elif allow_fallback: - print("Text-only generation failed, falling back to standard generate...") - generated_ids = full_model.generate(input_ids, max_new_tokens=100) - else: - print( - f"Skipping text-only generation for Nemotron-Parse ({stage_name}) - " - "this encoder-decoder model requires images for all operations." - ) + if text_response is not None: + print(f"✅ Text-only generation successful: {text_response[:100]}...") + generated_ids = text_response + elif allow_fallback: + print("Text-only generation failed, falling back to standard generate...") + generated_ids = full_model.generate(input_ids, max_new_tokens=100) # Run additional VL test with images print(f"Running additional VL test with images ({stage_name})...") @@ -111,10 +98,6 @@ def run_nemotron_vl_preview( def _is_multimodal_config(config): """Check if a config indicates a multimodal model (config-only version of is_multimodal_model).""" - # Check for Nemotron-Parse encoder-decoder architecture - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - return ( hasattr(config, "vision_config") # Standard vision config (e.g., Qwen2.5-VL) or getattr(config, "model_type", "") == "phi4mm" # Phi-4 multimodal @@ -123,7 +106,10 @@ def _is_multimodal_config(config): or ( hasattr(config, "embd_layer") and hasattr(config.embd_layer, "image_embd_layer") ) # Image embedding layers - or is_nemotron_parse # Nemotron-Parse conditional generation 
model + or getattr(config, "is_encoder_decoder", False) # Encoder-decoder VL models + or any( # Architecture-based detection for custom VL models (e.g., Nemotron-Parse) + "conditionalgeneration" in arch.lower() for arch in getattr(config, "architectures", []) + ) ) @@ -176,9 +162,20 @@ def calibrate_loop(_model): ) allowed_keys = set(forward_params.keys()) + # Check if model is encoder-decoder (needs decoder_input_ids instead of input_ids) + is_enc_dec = getattr(full_model.config, "is_encoder_decoder", False) + full_model.eval() with torch.no_grad(): for batch in calib_dataloader: + # For encoder-decoder models, rename input_ids → decoder_input_ids + # and disable KV caching to avoid tuple index errors in decoder layers + if is_enc_dec and "input_ids" in batch and "pixel_values" in batch: + batch["decoder_input_ids"] = batch.pop("input_ids") + if "attention_mask" in batch: + batch["decoder_attention_mask"] = batch.pop("attention_mask") + batch["use_cache"] = False + # Filter batch to only include parameters the model accepts if accepts_kwargs: call_kwargs = batch @@ -190,10 +187,8 @@ def calibrate_loop(_model): # Use safe_nemotron_vl_forward for Nemotron Nano VL (embedding-injection style) # For other VLMs (like Nemotron-Parse), use standard forward if hasattr(full_model, "img_context_token_id"): - # Nemotron Nano VL style safe_nemotron_vl_forward(full_model, call_kwargs) else: - # Standard encoder-decoder or other VLM architectures full_model(**call_kwargs) return calibrate_loop @@ -276,20 +271,9 @@ def get_tokenizer(ckpt_path, trust_remote_code=False, **kwargs) -> PreTrainedTok if "vila" in ckpt_path.lower(): ckpt_path += "/llm" - # Some custom tokenizers (e.g., Nemotron-Parse) print verbose output when loading. - # Only suppress stdout for trust_remote_code models where custom tokenizer code may be noisy. 
- if trust_remote_code: - import contextlib - import io - - with contextlib.redirect_stdout(io.StringIO()): - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, trust_remote_code=trust_remote_code, **kwargs - ) - else: - tokenizer = AutoTokenizer.from_pretrained( - ckpt_path, trust_remote_code=trust_remote_code, **kwargs - ) + tokenizer = AutoTokenizer.from_pretrained( + ckpt_path, trust_remote_code=trust_remote_code, **kwargs + ) # can't set attribute 'pad_token' for "" # We skip this step for Nemo models @@ -342,18 +326,9 @@ def get_processor( return MllamaImageProcessor(processor, device) else: - # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse). - # Suppress stdout for trust_remote_code models where custom processor code may be noisy. - import contextlib - import io - + # Try to load AutoProcessor for other VL models (e.g., Nemotron-Parse) try: - if model_kwargs.get("trust_remote_code", False): - with contextlib.redirect_stdout(io.StringIO()): - processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs) - else: - processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs) - + processor = AutoProcessor.from_pretrained(ckpt_path, **model_kwargs) print(f"Loaded AutoProcessor for model type: {model_type}") return processor except Exception as e: @@ -493,22 +468,12 @@ def get_model( try: hf_config = AutoConfig.from_pretrained(ckpt_path, **config_kwargs) - # Check specifically for Nemotron-Parse - architectures = getattr(hf_config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - if is_nemotron_vl(hf_config): - if is_nemotron_parse: - # Nemotron-Parse works fine with device_map="auto" - # Keep device_map="auto" to ensure proper device placement - print("Detected Nemotron-Parse model from config. Using automatic device mapping.") - else: - # For other Nemotron VL models, disable device_map for compatibility - print( - "Detected Nemotron VL model from config. 
" - "Disabling automatic device mapping for compatibility." - ) - device_map = None + print( + "Detected Nemotron VL model from config. " + "Disabling automatic device mapping for compatibility." + ) + device_map = None except Exception as e: print(f"Error: Could not load config from {ckpt_path}: {e}") raise RuntimeError(f"Failed to load model configuration from {ckpt_path}") from e @@ -564,13 +529,17 @@ def get_model( if not hasattr(transformers, architecture): warnings.warn( f"Architecture {architecture} not found in transformers: {transformers.__version__}. " - "Falling back to AutoModel." + "Falling back to AutoModelForCausalLM (or AutoModel for non-causal architectures)." ) assert trust_remote_code, ( "Please set trust_remote_code to True if you want to use this architecture" ) - auto_model_module = AutoModel + # Use AutoModelForCausalLM for causal LMs, AutoModel for encoder-decoder models + if getattr(hf_config, "is_encoder_decoder", False): + auto_model_module = AutoModel + else: + auto_model_module = AutoModelForCausalLM from_config = auto_model_module.from_config else: auto_model_module = getattr(transformers, architecture) @@ -617,21 +586,6 @@ def get_model( print(f"Moving model to {device} device...") model = model.to(device) - # For Nemotron-Parse, ensure the encoder (including RADIO) is fully on device - # The RADIO encoder has buffers that might not be properly moved even with device_map="auto" - # This is because custom RADIO modules might not fully support accelerate's device_map - if device != "cpu" and hasattr(model, "encoder"): - # Check if encoder has any buffers on CPU - cpu_buffers = [] - for name, buffer in model.encoder.named_buffers(): - if buffer.device.type == "cpu": - cpu_buffers.append(name) - - if cpu_buffers: - print(f"Found {len(cpu_buffers)} encoder buffers on CPU. 
Moving encoder to {device}...") - model.encoder = model.encoder.to(device) - print(f"Encoder moved to {device}") - if device == "cuda" and not is_model_on_gpu(model): print("Warning: Some parameters are not on a GPU. Calibration can be slow or hit OOM") diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 0a414e408..664dd04f0 100755 --- a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -361,6 +361,12 @@ def load_model(args: argparse.Namespace): default_pad_token = None is_nemotron_vl_model = is_nemotron_vl(full_model) + + # Default to image-text calibration for VLM models + if is_nemotron_vl_model and not args.calib_with_images: + print("Nemotron VL model detected. Enabling image-text calibration by default.") + args.calib_with_images = True + if model_type == "mllama": processor = get_processor( args.pyt_ckpt_path, @@ -689,7 +695,7 @@ def pre_quantize( preview_input_ids, args.pyt_ckpt_path, "before quantization", - allow_fallback=True, + allow_fallback=False, ) else: # Standard generation for non-Nemotron VL models diff --git a/examples/llm_ptq/vlm_utils.py b/examples/llm_ptq/vlm_utils.py index 529efeb15..9919e405b 100644 --- a/examples/llm_ptq/vlm_utils.py +++ b/examples/llm_ptq/vlm_utils.py @@ -18,7 +18,7 @@ import os from PIL import Image -from transformers import AutoImageProcessor, AutoProcessor, GenerationConfig +from transformers import AutoImageProcessor, AutoProcessor def run_vl_preview_generation(model, tokenizer, model_path, stage_name): @@ -73,34 +73,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): print(" Skipping VL preview generation.") return None - # Check if this is Nemotron-Parse early to set up proper generation config - config = model.config - architectures = getattr(config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - # Generate response question = "Describe this image briefly." 
# Updated for single image - - # Use model's GenerationConfig for Nemotron-Parse, dict for others - if is_nemotron_parse: - try: - generation_config = GenerationConfig.from_pretrained( - model_path, trust_remote_code=True - ) - print("Using Nemotron-Parse GenerationConfig from model") - except Exception as e: - print(f"Warning: Could not load GenerationConfig: {e}, using defaults") - generation_config = { - "max_new_tokens": 50, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } - else: - generation_config = { - "max_new_tokens": 50, - "do_sample": False, - "eos_token_id": tokenizer.eos_token_id, - } + generation_config = { + "max_new_tokens": 50, + "do_sample": False, + "eos_token_id": tokenizer.eos_token_id, + } print(f"Generating VL response ({stage_name})...") @@ -126,14 +105,8 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): else: processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True) - # is_nemotron_parse was already computed above - if is_nemotron_parse: - # Nemotron-Parse uses a specific task prompt format - # See: https://huggingface.co/nvidia/NVIDIA-Nemotron-Parse-v1.1#usage-example - prompt = "" - print(f"Using Nemotron-Parse task prompt: {prompt}") - else: - # Other VL models use chat templates + # Use chat template if available, otherwise fall back to default task prompt + if hasattr(tokenizer, "chat_template") and tokenizer.chat_template is not None: messages = [ {"role": "system", "content": "/no_think"}, { @@ -150,11 +123,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): ], }, ] - - # Apply chat template prompt = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) + else: + # For models without chat templates (e.g., encoder-decoder VL models), + # use the tokenizer's bos/eos tokens as a minimal prompt + prompt = (tokenizer.bos_token or "") + question # Process inputs using the processor with single image inputs = 
processor( @@ -175,22 +150,12 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): ) # Generate response using model.generate - if isinstance(generation_config, GenerationConfig): - # For Nemotron-Parse with GenerationConfig object - generated_ids = model.generate( - pixel_values=inputs.pixel_values, - input_ids=inputs.input_ids, - attention_mask=inputs.attention_mask, - generation_config=generation_config, - ) - else: - # For other models with dict generation config - generated_ids = model.generate( - pixel_values=inputs.pixel_values, - input_ids=inputs.input_ids, - attention_mask=inputs.attention_mask, - **generation_config, - ) + generated_ids = model.generate( + pixel_values=inputs.pixel_values, + input_ids=inputs.input_ids, + attention_mask=inputs.attention_mask, + **generation_config, + ) # Decode the response (trim input tokens like in the working example) if generated_ids is None: @@ -199,20 +164,13 @@ def run_vl_preview_generation(model, tokenizer, model_path, stage_name): generated_ids_trimmed = [ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids) ] - - # For Nemotron-Parse, use tokenizer.batch_decode instead of processor.batch_decode - if is_nemotron_parse and hasattr(tokenizer, "batch_decode"): - output_text = tokenizer.batch_decode( - generated_ids_trimmed, - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) - else: - output_text = processor.batch_decode( - generated_ids_trimmed, - skip_special_tokens=True, - clean_up_tokenization_spaces=False, - ) + # Use processor.batch_decode if available, otherwise fall back to tokenizer + decoder = processor if hasattr(processor, "batch_decode") else tokenizer + output_text = decoder.batch_decode( + generated_ids_trimmed, + skip_special_tokens=True, + clean_up_tokenization_spaces=False, + ) if output_text is None or len(output_text) == 0: raise ValueError("Decoding returned empty output") diff --git 
a/modelopt/torch/export/unified_export_hf.py b/modelopt/torch/export/unified_export_hf.py index 878970dd3..b6b92f6ff 100644 --- a/modelopt/torch/export/unified_export_hf.py +++ b/modelopt/torch/export/unified_export_hf.py @@ -316,13 +316,9 @@ def llm_dummy_forward(): [1, model.config.num_mel_bins, feature_extractor.nb_max_frames], dtype=model.dtype ).to(model.device) - # Check if this is Nemotron-Parse (encoder-decoder VL model) - architectures = getattr(model.config, "architectures", []) - is_nemotron_parse = any("nemotronparse" in arch.lower() for arch in architectures) - - if is_vl_model and ("nemotron" in model_type or is_nemotron_parse): - # For Nemotron VL models (including Nemotron-Parse), run optimization on just the - # language model/decoder. This avoids needing pixel_values for the vision encoder. + if is_vl_model and "nemotron" in model_type: + # For Nemotron VL models, run optimization on just the language model/decoder. + # This avoids needing pixel_values for the vision encoder. language_model_lineage = get_language_model_from_vl(model) if language_model_lineage is not None: @@ -330,11 +326,8 @@ def llm_dummy_forward(): print( f"Running optimization on language model with fake_input shape: {fake_input.shape}" ) - # For Nemotron-Parse decoder, force use_cache=False to avoid tuple index errors - if is_nemotron_parse: - language_model(fake_input, use_cache=False) - else: - language_model(fake_input) + # Pass use_cache=False to avoid KV cache issues in encoder-decoder models + language_model(fake_input, use_cache=False) else: raise ValueError( f"Cannot extract language_model from Nemotron VL model (type: {model_type}). 
" @@ -412,42 +405,25 @@ def _export_quantized_weight( if quantization_format == QUANTIZATION_FP8: # Convert amax to float32 - # Note: Use the public 'amax' property, not the private '_amax' attribute - if hasattr(weight_quantizer, "_amax") and weight_quantizer._amax is not None: - weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) - amax_tensor = weight_quantizer._amax - else: - # Fallback to public amax property - amax_tensor = weight_quantizer.amax - if amax_tensor is not None and hasattr(amax_tensor, "to"): - amax_tensor = amax_tensor.to(torch.float32) - - # Only compute scaling factor if amax_tensor is valid - if amax_tensor is not None and hasattr(amax_tensor, "dim"): - if amax_tensor.dim() == 1: - # Per-tensor amax - weight_scaling_factor = torch.tensor( - weight_quantizer.amax.item() / weight_quantizer.maxbound - ) - else: - # Per-channel amax - weight_scaling_factor = torch.tensor( - weight_quantizer.amax / weight_quantizer.maxbound - ) + weight_quantizer._amax = weight_quantizer._amax.to(torch.float32) - sub_module.register_buffer( - quantizer_attrs.weight_scale, - weight_scaling_factor, + if weight_quantizer._amax.dim() == 1: + # Per-tensor amax + weight_scaling_factor = torch.tensor( + weight_quantizer.amax.item() / weight_quantizer.maxbound ) + else: + # Per-channel amax + weight_scaling_factor = torch.tensor(weight_quantizer.amax / weight_quantizer.maxbound) - if hasattr(input_quantizer, "_amax") or ( - input_quantizer is not None - and hasattr(input_quantizer, "amax") - and input_quantizer.amax is not None - ): + sub_module.register_buffer( + quantizer_attrs.weight_scale, + weight_scaling_factor, + ) + + if hasattr(input_quantizer, "_amax"): assert input_quantizer is not None - if hasattr(input_quantizer, "_amax") and input_quantizer._amax is not None: - input_quantizer._amax = input_quantizer._amax.to(torch.float32) + input_quantizer._amax = input_quantizer._amax.to(torch.float32) sub_module.register_buffer( 
quantizer_attrs.input_scale, @@ -456,14 +432,9 @@ def _export_quantized_weight( ).squeeze(), ) - if hasattr(output_quantizer, "_amax") or ( - output_quantizer is not None - and hasattr(output_quantizer, "amax") - and output_quantizer.amax is not None - ): + if hasattr(output_quantizer, "_amax"): assert output_quantizer is not None - if hasattr(output_quantizer, "_amax") and output_quantizer._amax is not None: - output_quantizer._amax = output_quantizer._amax.to(torch.float32) + output_quantizer._amax = output_quantizer._amax.to(torch.float32) else: # Register weight_scale and input_scale if quantization_format == QUANTIZATION_FP8_PB_REAL: @@ -514,18 +485,6 @@ def _export_quantized_weight( weight_scale: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale, None) weight_scale_2: torch.Tensor | None = getattr(sub_module, quantizer_attrs.weight_scale_2, None) - # If weight_scale is None (e.g., quantizer wasn't calibrated), skip quantization for this module - # This can happen for modules that were disabled from quantization or have invalid calibration data - if weight_scale is None and quantization_format not in [ - QUANTIZATION_NVFP4, - QUANTIZATION_NVFP4_AWQ, - ]: - # For NVFP4, weight_scale is computed later, so we can't check here - print( - f"Warning: Skipping quantization for {type(sub_module).__name__} - no weight_scale found" - ) - return - # Transpose weight for bmm-style expert quantization (llama4, gpt-oss) # Check if this is a BMM-style expert weight that needs transposition is_bmm_expert_weight = weight.dim() == 3 and any( From e94fbc17d639f65d7dc875bb5c4cedc47354e2ea Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 11 Feb 2026 14:40:58 -0800 Subject: [PATCH 09/11] use batch_size = 1 for calib_with_images Signed-off-by: Zhiyu Cheng --- examples/llm_ptq/hf_ptq.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/examples/llm_ptq/hf_ptq.py b/examples/llm_ptq/hf_ptq.py index 664dd04f0..de434e1cf 100755 --- 
a/examples/llm_ptq/hf_ptq.py +++ b/examples/llm_ptq/hf_ptq.py @@ -809,14 +809,10 @@ def quantize_main( device: torch.device, ): if args.batch_size == 0: - # Check if this is a vision-language model - # For VL models, skip automatic batch size detection and use a conservative default - # since proper multimodal input preparation is complex - if is_multimodal_model(full_model) or is_nemotron_vl(full_model): - print( - "Vision-language model detected. Using default batch_size=1 for calibration " - "to ensure proper handling of multimodal inputs." - ) + # For VL models with image-text calibration, skip automatic batch size detection + # since get_max_batch_size can't handle multimodal inputs + if args.calib_with_images: + print("Image-text calibration enabled. Using default batch_size=1 for calibration.") args.batch_size = 1 else: # Calibration/sparsification will actually take much more memory than regular inference From 0666b5590e3829c29614cb2fc18f12bb3486ddc2 Mon Sep 17 00:00:00 2001 From: Zhiyu Cheng Date: Wed, 11 Feb 2026 23:23:13 -0800 Subject: [PATCH 10/11] fix ci Signed-off-by: Zhiyu Cheng --- modelopt/torch/export/model_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/modelopt/torch/export/model_utils.py b/modelopt/torch/export/model_utils.py index 5dac1b933..6cb5be9a5 100755 --- a/modelopt/torch/export/model_utils.py +++ b/modelopt/torch/export/model_utils.py @@ -148,9 +148,9 @@ def get_language_model_from_vl(model) -> list[nn.Module] | None: return [model, model.language_model] # Pattern 3: For encoder-decoder VL models (e.g., Nemotron-Parse), the decoder is the language model. - # Note: This is safe because this function is only called when the model is already detected as a VLM. - # Non-VLM encoder-decoder models (T5, Bart) won't reach this code path. 
-    if hasattr(model, "decoder"):
+    # Only match if the model is detected as multimodal to avoid matching non-VLM encoder-decoder
+    # models like T5, Bart, Whisper, which also have .decoder.
+    if hasattr(model, "decoder") and is_multimodal_model(model):
         return [model, model.decoder]
 
     # Pattern 4: No language_model found

From eef7a786d4fbd7c918b07fe548804e5ab6bcdb75 Mon Sep 17 00:00:00 2001
From: Zhiyu Cheng
Date: Thu, 12 Feb 2026 16:18:40 -0800
Subject: [PATCH 11/11] update changelog

Signed-off-by: Zhiyu Cheng
---
 CHANGELOG.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 9d7500e58..bbbe6ab9e 100755
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -21,6 +21,7 @@ NVIDIA Model Optimizer Changelog (Linux)
 - Add LTX-2 and Wan2.2 (T2V) support in the diffusers quantization workflow.
 - Add PTQ support for GLM-4.7, including loading MTP layer weights from a separate ``mtp.safetensors`` file and export as-is.
 - Add support for image-text data calibration in PTQ for Nemotron VL models.
+- Add PTQ support for Nemotron-Parse.
 
 0.41 (2026-01-19)
 ^^^^^^^^^^^^^^^^^