Commit 196cc92

committed
Update documentation from main repository
1 parent cd0a76b commit 196cc92

File tree

1 file changed

+48
-1
lines changed


docs/api/cli.md

Lines changed: 48 additions & 1 deletion
@@ -154,6 +154,52 @@ This file can be incomplete, and missing sections will be filled in by the defau
 }
 ```
 
+##### Example Quantization Configuration (`config.json`)
+
+`quantization_config` can be obtained from any quantization configuration used in `transformers` via its `.to_json_file(filename)` method:
+
+```python
+from transformers import BitsAndBytesConfig
+
+quantization_config = BitsAndBytesConfig(load_in_8bit=True)
+quantization_config.to_json_file("quantization_config.json")
+```
+
+Then copy it into `config.json`:
+
+```json
+{
+    "model": "",
+    "backend": "transformers",
+    "num_gpus": 1,
+    "auto_scaling_config": {
+        "metric": "concurrency",
+        "target": 1,
+        "min_instances": 0,
+        "max_instances": 10,
+        "keep_alive": 0
+    },
+    "backend_config": {
+        "pretrained_model_name_or_path": "",
+        "device_map": "auto",
+        "torch_dtype": "float16",
+        "hf_model_class": "AutoModelForCausalLM",
+        "quantization_config": {
+            "_load_in_4bit": false,
+            "_load_in_8bit": true,
+            "bnb_4bit_compute_dtype": "float32",
+            "bnb_4bit_quant_storage": "uint8",
+            "bnb_4bit_quant_type": "fp4",
+            "bnb_4bit_use_double_quant": false,
+            "llm_int8_enable_fp32_cpu_offload": false,
+            "llm_int8_has_fp16_weight": false,
+            "llm_int8_skip_modules": null,
+            "llm_int8_threshold": 6.0,
+            "load_in_4bit": false,
+            "load_in_8bit": true,
+            "quant_method": "bitsandbytes"
+        }
+    }
+}
+```
+
 Below is a description of all the fields in config.json.
 
 | Field | Description |
@@ -174,6 +220,7 @@ Below is a description of all the fields in config.json.
 | backend_config.hf_model_class | HuggingFace model class. |
 | backend_config.enable_lora | Set to true to enable loading LoRA adapters during inference. |
 | backend_config.lora_adapters | A dictionary of LoRA adapters in the format `{name: path}`, where each path is a local or Hugging Face-hosted LoRA adapter directory. |
+| backend_config.quantization_config | A dictionary specifying the desired `BitsAndBytesConfig`. Can be obtained by saving a `BitsAndBytesConfig` to JSON via `BitsAndBytesConfig.to_json_file(filename)`. Defaults to None. |
 
 ### sllm-cli delete
 Delete deployed models by name, or delete specific LoRA adapters associated with a base model.
@@ -406,4 +453,4 @@ sllm-cli status
 #### Example
 ```bash
 sllm-cli status
-```
+```
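The workflow in the added documentation (save a `BitsAndBytesConfig` to JSON, then copy it under `backend_config` in `config.json`) can also be scripted. A minimal sketch, assuming plain `dict`s loaded from those files; `merge_quantization_config` is a hypothetical helper, not part of `sllm-cli` or `transformers`:

```python
import json


def merge_quantization_config(config: dict, quantization_config: dict) -> dict:
    # Hypothetical helper: nest a saved quantization config under
    # backend_config, matching the layout of the documented config.json.
    config.setdefault("backend_config", {})["quantization_config"] = quantization_config
    return config


# Pared-down stand-ins for config.json and quantization_config.json;
# in practice these would come from json.load() on the real files.
config = {
    "model": "",
    "backend": "transformers",
    "backend_config": {"device_map": "auto"},
}
quant = {"load_in_8bit": True, "load_in_4bit": False, "quant_method": "bitsandbytes"}

merged = merge_quantization_config(config, quant)
print(json.dumps(merged, indent=4))
```

Because `setdefault` is used, the helper works whether or not `backend_config` already exists in the deployment config.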
