[OMNIML-2182]: Add example for multinode calibration using FSDP2 #432
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -235,6 +235,38 @@ with init_quantized_weights(mtq.NVFP4_DEFAULT_CFG): | |
| mtq.calibrate(model, algorithm="max", forward_loop=calibrate_loop) | ||
| ``` | ||
|
|
||
| ## Multi-Node Post-Training Quantization with FSDP2 | ||
|
|
||
| ModelOpt enables quantization of LLMs across multiple GPU nodes using various quantization formats. It leverages Hugging Face's Accelerate library and FSDP2 for distributed model sharding and calibration. | ||
|
|
||
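The sketch below illustrates what this flow looks like in code: load the model, shard it with FSDP2 via Accelerate, run calibration on every rank, then export a unified checkpoint. It is a minimal illustration and not the `multinode_ptq.py` script itself; the dataloader, the `NVFP4_DEFAULT_CFG` choice, the `export_hf_checkpoint` helper, and the `<path_to_model>` / `<export_path>` placeholders are assumptions chosen for brevity.

```python
# Minimal sketch of the multi-node calibration flow (illustrative only; the actual
# multinode_ptq.py may differ in dataloading, config selection, and export details).
import torch
from accelerate import Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer

import modelopt.torch.quantization as mtq
from modelopt.torch.export import export_hf_checkpoint  # assumed export helper

accelerator = Accelerator()  # picks up the FSDP2 settings from fsdp2.yaml

model_path = "<path_to_model>"  # placeholder
model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = accelerator.prepare(model)  # shards the model across all ranks with FSDP2

# Placeholder calibration data; the real script builds batches from --dataset.
calib_texts = ["ModelOpt multi-node calibration sample."] * 8
calib_batches = [tokenizer(t, return_tensors="pt").to(accelerator.device) for t in calib_texts]

def calibrate_loop(model):
    # Every rank runs forward passes over its calibration batches.
    for batch in calib_batches:
        model(**batch)

model = mtq.quantize(model, mtq.NVFP4_DEFAULT_CFG, forward_loop=calibrate_loop)

# Export a unified Hugging Face checkpoint for deployment.
export_hf_checkpoint(model, export_dir="<export_path>")
```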
| ### Usage | ||
|
|
||
| For distributed execution across multiple nodes, use the `accelerate` library. A template configuration file (`fsdp2.yaml`) is provided and can be customized for user-specific requirements. | ||
|
Review comment: Fix hyphenation for the compound modifier. The phrase "user specific requirements" should use a hyphen when the modifier precedes the noun: "user-specific requirements".
||
|
|
||
| On each node, run the following command: | ||
|
|
||
| ```bash | ||
| accelerate launch --config_file fsdp2.yaml \ | ||
| --num_machines=<num_nodes> \ | ||
| --machine_rank=<current_node_rank> \ | ||
| --main_process_ip=<node0_ip_addr> \ | ||
| --main_process_port=<port> \ | ||
| --fsdp_transformer_layer_cls_to_wrap=<decoder_layer_name> \ | ||
| multinode_ptq.py \ | ||
| --pyt_ckpt_path <path_to_model> \ | ||
| --qformat <fp8/nvfp4/nvfp4_awq/int8> \ | ||
| --kv_cache_qformat <fp8/nvfp4/nvfp4_affine/none> \ | ||
| --batch_size <calib_batch_size> \ | ||
| --calib_size <num_calib_samples> \ | ||
| --dataset <dataset> \ | ||
| --export_path <export_path> \ | ||
| --trust_remote_code | ||
| ``` | ||
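For reference, the script-side flags in the command above map onto an argument parser roughly as follows. This is a hypothetical reconstruction based only on the command shown here; the real defaults, choices, and help text in `multinode_ptq.py` may differ.

```python
import argparse

def parse_args():
    # Hypothetical reconstruction of the CLI; flag names follow the launch command above.
    parser = argparse.ArgumentParser(description="Multi-node PTQ with FSDP2 (sketch)")
    parser.add_argument("--pyt_ckpt_path", required=True, help="Path to the Hugging Face model checkpoint")
    parser.add_argument("--qformat", default="fp8", choices=["fp8", "nvfp4", "nvfp4_awq", "int8"])
    parser.add_argument("--kv_cache_qformat", default="fp8", choices=["fp8", "nvfp4", "nvfp4_affine", "none"])
    parser.add_argument("--batch_size", type=int, default=1, help="Calibration batch size per forward pass")
    parser.add_argument("--calib_size", type=int, default=512, help="Total number of calibration samples")
    parser.add_argument("--dataset", default="cnn_dailymail", help="Calibration dataset name (assumed default)")
    parser.add_argument("--export_path", default="exported_model", help="Directory for the exported checkpoint")
    parser.add_argument("--trust_remote_code", action="store_true")
    return parser.parse_args()
```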
|
|
||
| The exported checkpoint can be deployed using TensorRT-LLM, vLLM, or SGLang. For more details, refer to the [deployment section](#deployment) of this document. | ||
|
|
||
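As a quick illustration, a checkpoint exported with `--qformat fp8` can typically be loaded in vLLM as sketched below, relying on vLLM's ModelOpt quantization support. Treat the `quantization="modelopt"` setting and the `<export_path>` placeholder as assumptions; see the deployment section for the authoritative per-framework instructions.

```python
from vllm import LLM, SamplingParams

# Load the exported unified checkpoint; "modelopt" selects vLLM's ModelOpt FP8 support.
llm = LLM(model="<export_path>", quantization="modelopt")

outputs = llm.generate(["Hello, my name is"], SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```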
| > *Performance Note: FSDP2 is designed for training workloads and may result in longer calibration and export times. For faster calibration, maximize the batch size based on available GPU memory and choose the right number of GPUs to avoid unnecessary communication.* | ||
| > | ||
| ## Framework Scripts | ||
|
|
||
| ### Hugging Face Example [Script](./scripts/huggingface_example.sh) | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,6 +13,7 @@ | |
| # See the License for the specific language governing permissions and | ||
| # limitations under the License. | ||
|
|
||
| import copy | ||
| import glob | ||
| import os | ||
| import shutil | ||
|
|
@@ -32,11 +33,66 @@ | |
| except ImportError: | ||
| snapshot_download = None | ||
|
|
||
| import modelopt.torch.quantization as mtq | ||
| from modelopt.torch.utils.image_processor import MllamaImageProcessor | ||
|
|
||
| SPECULATIVE_MODEL_LIST = ["Eagle", "Medusa"] | ||
|
|
||
|
|
||
| def build_quant_cfg( | ||
| qformat, | ||
| kv_cache_qformat, | ||
| awq_block_size, | ||
| auto_quantize, | ||
| model_type, | ||
| quant_cfg_choices, | ||
| kv_quant_cfg_choices, | ||
| ): | ||
| quant_cfg = {} | ||
| if not auto_quantize: | ||
| assert qformat in quant_cfg_choices, ( | ||
| f"Unsupported quantization format: {qformat} with {kv_cache_qformat} KV cache" | ||
| ) | ||
|
|
||
| quant_cfg = quant_cfg_choices[qformat] | ||
|
|
||
| if "awq" in qformat: | ||
| quant_cfg = copy.deepcopy(quant_cfg_choices[qformat]) | ||
| weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"] | ||
| if isinstance(weight_quantizer, list): | ||
| weight_quantizer = weight_quantizer[0] | ||
| # If awq_block_size argument is provided, update weight_quantizer | ||
| if awq_block_size: | ||
| weight_quantizer["block_sizes"][-1] = awq_block_size | ||
|
|
||
| # Coarser optimal scale search seems to resolve the overflow in TRT-LLM for some models | ||
| if qformat == "w4a8_awq" and model_type in ["gemma", "mpt"]: | ||
| quant_cfg["algorithm"] = {"method": "awq_lite", "alpha_step": 1} | ||
|
|
||
| enable_quant_kv_cache = kv_cache_qformat != "none" | ||
| print(f"{'Enable' if enable_quant_kv_cache else 'Disable'} KV cache quantization") | ||
|
|
||
| # Check if any bmm_quantizer is in the quant_cfg. If so, we need to enable the bmm_quantizer. | ||
| if enable_quant_kv_cache: | ||
| quant_cfg = apply_kv_cache_quant( | ||
| quant_cfg, | ||
| getattr(mtq, kv_quant_cfg_choices[kv_cache_qformat])["quant_cfg"], | ||
| ) | ||
|
Comment on lines +57 to +80

Review comment: Always deep-copy `quant_cfg` to prevent shared-state mutation. Non-AWQ formats take a shallow reference (line 57), so the later in-place updates (the `algorithm` overrides, the KV-cache entries, and the phi4mm exclusions) would mutate the shared config in `quant_cfg_choices`. Apply this diff:

-        quant_cfg = quant_cfg_choices[qformat]
+        quant_cfg = copy.deepcopy(quant_cfg_choices[qformat])

         if "awq" in qformat:
-            quant_cfg = copy.deepcopy(quant_cfg_choices[qformat])
             weight_quantizer = quant_cfg["quant_cfg"]["*weight_quantizer"]
||
|
|
||
| # Gemma 7B has accuracy regression using alpha 1. We set 0.5 instead. | ||
| if model_type == "gemma" and "int8_sq" in qformat: | ||
| quant_cfg["algorithm"] = {"method": "smoothquant", "alpha": 0.5} | ||
|
|
||
| if model_type == "phi4mm": | ||
| # Only quantize the language model | ||
| quant_cfg["quant_cfg"]["*speech*"] = {"enable": False} | ||
| quant_cfg["quant_cfg"]["*audio*"] = {"enable": False} | ||
| quant_cfg["quant_cfg"]["*image*"] = {"enable": False} | ||
| quant_cfg["quant_cfg"]["*vision*"] = {"enable": False} | ||
|
|
||
| return quant_cfg | ||
|
|
||
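For context, a call site for `build_quant_cfg` might look like the sketch below. The two choice dictionaries are illustrative assumptions (the real mappings live in the example scripts), the KV-cache values must be attribute names that exist on `modelopt.torch.quantization`, and `model` / `calibrate_loop` are assumed to be defined as in the calibration flow.

```python
import modelopt.torch.quantization as mtq

# Illustrative mappings from CLI format names to ModelOpt configs; the real
# dictionaries in the example scripts may contain more formats.
QUANT_CFG_CHOICES = {
    "fp8": mtq.FP8_DEFAULT_CFG,
    "nvfp4": mtq.NVFP4_DEFAULT_CFG,
}
KV_QUANT_CFG_CHOICES = {
    "fp8": "FP8_KV_CFG",      # resolved with getattr(mtq, ...) inside build_quant_cfg (assumed name)
    "nvfp4": "NVFP4_KV_CFG",  # assumed name
}

quant_cfg = build_quant_cfg(
    qformat="nvfp4",
    kv_cache_qformat="fp8",
    awq_block_size=None,
    auto_quantize=False,
    model_type="llama",
    quant_cfg_choices=QUANT_CFG_CHOICES,
    kv_quant_cfg_choices=KV_QUANT_CFG_CHOICES,
)
model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
```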
|
|
||
| def is_speculative(hf_config): | ||
| """Check if the model architecture is a speculative model.""" | ||
| return hf_config.architectures and any( | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,30 @@ | ||
| # ============================================================================= | ||
| # FSDP Configuration for running LLM PTQ on a multi-node setup. This file is consumed by examples/llm_ptq/multinode_ptq.py | ||
| # ============================================================================= | ||
|
|
||
| compute_environment: LOCAL_MACHINE | ||
| debug: false | ||
| distributed_type: FSDP | ||
| downcast_bf16: 'no' | ||
| enable_cpu_affinity: false | ||
| fsdp_config: | ||
| fsdp_activation_checkpointing: false | ||
| fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP | ||
| fsdp_cpu_ram_efficient_loading: true | ||
| fsdp_offload_params: false | ||
| fsdp_reshard_after_forward: true | ||
| fsdp_state_dict_type: FULL_STATE_DICT | ||
| fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer | ||
| fsdp_use_orig_params: true | ||
| fsdp_version: 2 | ||
| machine_rank: 0 | ||
| main_training_function: main | ||
| mixed_precision: 'no' | ||
| num_machines: 2 | ||
| num_processes: 16 | ||
| rdzv_backend: c10d | ||
| same_network: true | ||
| tpu_env: [] | ||
| tpu_use_cluster: false | ||
| tpu_use_sudo: false | ||
| use_cpu: false | ||