Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .cd/benchmark/benchmark_scenarios_text.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ qwen25_14b_instruct:

qwen25_32b_instruct:
MODEL: Qwen/Qwen2.5-32B-Instruct
CONCURRENT_REQ: 8

qwen25_72b_instruct:
MODEL: Qwen/Qwen2.5-72B-Instruct
Expand Down
2 changes: 1 addition & 1 deletion .cd/benchmark/benchmark_user.env
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
MODEL
INPUT_TOK
OUTPUT_TOK
CON_REQ
CONCURRENT_REQ
NUM_PROMPTS
4 changes: 1 addition & 3 deletions .cd/server/server_output.env
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,21 @@ QUANT_DTYPE
BLOCK_SIZE
VLLM_PROMPT_BS_BUCKET_MIN
VLLM_PROMPT_BS_BUCKET_STEP
VLLM_PROMPT_BS_BUCKET_MAX
VLLM_DECODE_BS_BUCKET_MIN
VLLM_DECODE_BS_BUCKET_STEP
VLLM_PROMPT_SEQ_BUCKET_MIN
VLLM_PROMPT_SEQ_BUCKET_STEP
VLLM_DECODE_BLOCK_BUCKET_MIN
VLLM_DECODE_BLOCK_BUCKET_STEP
MAX_NUM_PREFILL_SEQS
NUM_HIDDEN_LAYERS
HIDDEN_SIZE
NUM_KEY_VALUE_HEADS
NUM_ATTENTION_HEADS
CACHE_DTYPE_BYTES
LIMIT_MODEL_LEN
PT_HPU_LAZY_MODE
VLLM_DELAYED_SAMPLING
VLLM_SKIP_WARMUP
EXPERIMENTAL_WEIGHT_SHARING
VLLM_EXPONENTIAL_BUCKETING
MAX_NUM_BATCHED_TOKENS
PT_HPU_ENABLE_LAZY_COLLECTIVES
Expand Down
3 changes: 3 additions & 0 deletions .cd/server/server_user.env
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ PT_HPU_LAZY_MODE
VLLM_DECODE_BLOCK_BUCKET_STEP
VLLM_DECODE_BS_BUCKET_STEP
VLLM_PROMPT_BS_BUCKET_STEP
VLLM_PROMPT_BS_BUCKET_MAX
VLLM_PROMPT_SEQ_BUCKET_STEP
VLLM_SKIP_WARMUP
MAX_MODEL_LEN
Expand All @@ -11,3 +12,5 @@ TENSOR_PARALLEL_SIZE
VLLM_EXPONENTIAL_BUCKETING
GPU_MEM_UTILIZATION
ASYNC_SCHEDULING
ENABLE_PREFIX_CACHING
EXTRA_ARGS
38 changes: 19 additions & 19 deletions .cd/server/settings_vllm.csv
Original file line number Diff line number Diff line change
@@ -1,19 +1,19 @@
MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,MAX_NUM_PREFILL_SEQS,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_DELAYED_SAMPLING,VLLM_SKIP_WARMUP,EXPERIMENTAL_WEIGHT_SHARING,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC
meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,1
Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,53687091200,2,2,48.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,TRUE,FALSE,0,FALSE,2048,true,true,1,0
Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,TRUE,FALSE,0,FALSE,2048,false,false,1,0
MODEL,TENSOR_PARALLEL_SIZE,MAX_MODEL_LEN,TOTAL_GPU_MEM,UNAVAILABLE_MEM_ABS,MODEL_MEM_FROM_CONFIG,MODEL_DTYPE,QUANT_DTYPE,MODEL_MEM,PROFILER_MEM_OVERHEAD,APPROX_MEM_PER_GRAPH_MB,fsdpa,GPU_FREE_MEM_TARGET,BLOCK_SIZE,VLLM_PROMPT_BS_BUCKET_MIN,VLLM_PROMPT_BS_BUCKET_STEP,VLLM_DECODE_BS_BUCKET_MIN,VLLM_DECODE_BS_BUCKET_STEP,VLLM_PROMPT_SEQ_BUCKET_MIN,VLLM_PROMPT_SEQ_BUCKET_STEP,VLLM_DECODE_BLOCK_BUCKET_MIN,VLLM_DECODE_BLOCK_BUCKET_STEP,VLLM_PROMPT_BS_BUCKET_MAX,NUM_HIDDEN_LAYERS,HIDDEN_SIZE,NUM_KEY_VALUE_HEADS,NUM_ATTENTION_HEADS,CACHE_DTYPE_BYTES,LIMIT_MODEL_LEN,PT_HPU_LAZY_MODE,VLLM_SKIP_WARMUP,VLLM_EXPONENTIAL_BUCKETING,MAX_NUM_BATCHED_TOKENS,ENABLE_PREFIX_CACHING,VLLM_CONTIGUOUS_PA,VLLM_DEFRAG,ASYNC_SCHEDULING,VLLM_WEIGHT_LOAD_FORCE_SYNC
meta-llama/Llama-3.1-8B-Instruct,1,4352,128,2,16060522496,2,2,14.95752716,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
meta-llama/Llama-3.1-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
meta-llama/Llama-3.3-70B-Instruct,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
meta-llama/Llama-3.2-1B-Instruct,1,4352,128,2,2471645608,2,2,2.301899351,5.5,5,1,1,128,1,32,1,32,128,256,128,256,1,16,2048,8,32,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
meta-llama/Llama-3.2-3B-Instruct,1,4352,128,2,6425499648,2,2,5.984212875,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,28,3072,8,24,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
mistralai/Mixtral-8x7B-Instruct-v0.1,2,4352,256,2,93405585408,2,2,86.99073029,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
mistralai/Mixtral-8x22B-Instruct-v0.1,4,4352,512,2,2.8126E+11,2,2,261.9439201,5.5,10,1,1,128,1,32,1,32,128,256,128,256,1,56,6144,8,48,2,65536,1,FALSE,FALSE,2048,false,true,true,1,1
mistralai/Mistral-7B-Instruct-v0.2,1,4352,128,2,14483464192,2,2,13.48877716,5.5,10,1,9,128,1,32,1,32,128,256,128,256,1,32,4096,8,32,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
meta-llama/Llama-3.1-405B-Instruct,8,4352,1024,2,8.11707E+11,2,2,755.9608459,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,126,16384,8,128,2,131072,1,FALSE,FALSE,2048,false,true,true,1,1
Qwen/Qwen2.5-14B-Instruct,1,4352,128,2,29540067328,2,2,27.51133156,5.5,10,0,12,128,1,32,1,32,128,256,128,256,1,48,5120,8,40,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
deepseek-ai/DeepSeek-R1-Distill-Llama-70B,4,4352,512,2,1.41107E+11,2,2,131.4165192,5.5,20,1,1,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,131072,1,FALSE,FALSE,2048,false,true,true,1,0
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,1,16,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,FALSE,FALSE,2048,false,true,true,1,1
Qwen/Qwen2.5-72B-Instruct,4,4352,512,2,1.45412E+11,2,2,135.4258575,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,80,8192,8,64,2,32768,1,FALSE,FALSE,2048,false,true,true,1,1
Qwen/Qwen2.5-7B-Instruct,1,4352,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
Qwen/Qwen2.5-32B-Instruct,1,4352,128,2,65527752704,2,2,61.02747536,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,64,5120,8,40,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
ibm-granite/granite-8b-code-instruct-4k,1,4096,128,2,21474836480,2,2,20.00000000,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,36,4096,8,32,2,32768,1,FALSE,FALSE,2048,false,true,true,1,0
ibm-granite/granite-20b-code-instruct-8k,1,4352,128,2,40133986304,2,2,37.37,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,52,6144,1,48,2,65536,1,FALSE,FALSE,2048,false,true,true,1,0
Qwen/Qwen2.5-VL-7B-Instruct,1,8448,128,2,15231233024,2,2,14.18519115,5.5,10,0,3,128,1,32,1,32,128,256,128,256,1,28,3584,4,28,2,32768,1,FALSE,FALSE,2048,false,false,false,1,0
23 changes: 18 additions & 5 deletions .cd/server/vllm_autocalc_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,14 @@ def calc_PT_HPU_ENABLE_LAZY_COLLECTIVES(ctx):
return ctx['TENSOR_PARALLEL_SIZE'] > 1


def _flag_is_set(value):
    """Interpret *value* as an on/off flag.

    Non-string values keep their ordinary Python truthiness. Strings are
    compared case-insensitively against common "off" spellings, so a textual
    ``"false"`` or ``"0"`` is not mistaken for an enabled flag (any non-empty
    string is truthy under plain ``bool()``).
    """
    if isinstance(value, str):
        return value.strip().lower() not in ('', '0', 'false', 'no', 'off')
    return bool(value)


def calc_VLLM_CONTIGUOUS_PA(ctx):
    """Return the logical inverse of ``ctx['ENABLE_PREFIX_CACHING']``.

    Args:
        ctx: mapping that must contain ``ENABLE_PREFIX_CACHING``.

    Returns:
        bool: ``True`` when prefix caching is off, ``False`` when it is on.

    Raises:
        KeyError: if ``ENABLE_PREFIX_CACHING`` is missing from ``ctx``.
    """
    # NOTE(review): string handling assumes the flag may arrive as text
    # (e.g. from an environment override) -- confirm against the ctx loader.
    return not _flag_is_set(ctx['ENABLE_PREFIX_CACHING'])


def calc_VLLM_DEFRAG(ctx):
    """Return ``ctx['VLLM_CONTIGUOUS_PA']`` coerced to a boolean.

    Args:
        ctx: mapping that must contain ``VLLM_CONTIGUOUS_PA``.

    Returns:
        bool: the flag's value; defrag simply mirrors contiguous PA.

    Raises:
        KeyError: if ``VLLM_CONTIGUOUS_PA`` is missing from ``ctx``.
    """
    return _flag_is_set(ctx['VLLM_CONTIGUOUS_PA'])


def calc_MODEL_MEM_FROM_CONFIG(ctx):
    """Return ``MODEL_MEM_FROM_CONFIG`` from ``ctx`` converted to ``float``.

    The value is looked up with ``ctx.get``, so a missing key yields ``None``
    and the ``float`` conversion then raises ``TypeError``.
    """
    configured_mem = ctx.get('MODEL_MEM_FROM_CONFIG')
    return float(configured_mem)

Expand Down Expand Up @@ -93,13 +101,15 @@ def calc_NUM_DECODE_GRAPHS(ctx):
def calc_PROMPT_BS_RAMP_GRAPHS(ctx):
    """Compute ``1 + int(log2(min(BUCKET_MAX, BUCKET_STEP) / BUCKET_MIN))``.

    Reads ``VLLM_PROMPT_BS_BUCKET_MAX``, ``VLLM_PROMPT_BS_BUCKET_STEP`` and
    ``VLLM_PROMPT_BS_BUCKET_MIN`` from ``ctx``.

    Args:
        ctx: mapping holding the three prompt batch-size bucket settings.

    Returns:
        int: one plus the base-2 logarithm (truncated toward zero) of the
        ratio between the smaller of MAX/STEP and MIN.
    """
    # The ramp is capped by whichever of the bucket max and the step is smaller.
    ramp_limit = min(ctx['VLLM_PROMPT_BS_BUCKET_MAX'], ctx['VLLM_PROMPT_BS_BUCKET_STEP'])
    return 1 + int(math.log(ramp_limit / ctx['VLLM_PROMPT_BS_BUCKET_MIN'], 2))


def calc_PROMPT_BS_STEP_GRAPHS(ctx):
    """Compute ``max(0, int(1 + (BUCKET_MAX - BUCKET_STEP) / BUCKET_STEP))``.

    Reads ``VLLM_PROMPT_BS_BUCKET_MAX`` and ``VLLM_PROMPT_BS_BUCKET_STEP``
    from ``ctx``.

    Args:
        ctx: mapping holding the prompt batch-size bucket max and step.

    Returns:
        int: the computed count, clamped so it is never negative (the raw
        value drops below zero once MAX is far enough under STEP).
    """
    step = ctx['VLLM_PROMPT_BS_BUCKET_STEP']
    beyond_first_step = ctx['VLLM_PROMPT_BS_BUCKET_MAX'] - step
    return max(0, int(1 + beyond_first_step / step))


def calc_PROMPT_SEQ_RAMP_GRAPHS(ctx):
Expand Down Expand Up @@ -155,10 +165,11 @@ def calc_MAX_NUM_SEQS(ctx):
return max(1, ctx['MAX_NUM_SEQS'])
# Otherwise, calculate
val = (ctx['TENSOR_PARALLEL_SIZE'] * ctx['KV_CACHE_MEM'] / ctx['KV_CACHE_PER_SEQ'])
if ctx['DTYPE'] == 'fp8':
val = (max(1, math.floor(val / ctx['VLLM_DECODE_BS_BUCKET_STEP'])) * ctx['VLLM_DECODE_BS_BUCKET_STEP'])
# always round down for plugin as WA
if val < ctx['VLLM_DECODE_BS_BUCKET_STEP']:
val = pow(2, math.floor(math.log(val, 2)))
else:
val = (math.ceil(val / ctx['VLLM_DECODE_BS_BUCKET_STEP']) * ctx['VLLM_DECODE_BS_BUCKET_STEP'])
val = max(1, math.floor(val / ctx['VLLM_DECODE_BS_BUCKET_STEP'])) * ctx['VLLM_DECODE_BS_BUCKET_STEP']
# Special limit for Vision-Instruct models
if ctx['MODEL'] in ['meta-llama/Llama-3.2-11B-Vision-Instruct', 'meta-llama/Llama-3.2-90B-Vision-Instruct'
] and val > 128:
Expand All @@ -184,6 +195,8 @@ def calc_VLLM_PROMPT_SEQ_BUCKET_MAX(ctx):
"TENSOR_PARALLEL_SIZE": calc_TENSOR_PARALLEL_SIZE,
"MAX_MODEL_LEN": calc_MAX_MODEL_LEN,
"PT_HPU_ENABLE_LAZY_COLLECTIVES": calc_PT_HPU_ENABLE_LAZY_COLLECTIVES,
"VLLM_CONTIGUOUS_PA": calc_VLLM_CONTIGUOUS_PA,
"VLLM_DEFRAG": calc_VLLM_DEFRAG,
"MODEL_MEM_FROM_CONFIG": calc_MODEL_MEM_FROM_CONFIG,
"DEVICE_HPU_MEM": calc_DEVICE_HPU_MEM,
"TOTAL_GPU_MEM": calc_TOTAL_GPU_MEM,
Expand Down
4 changes: 4 additions & 0 deletions .cd/templates/template_vllm_server.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

#@VARS

# When contiguous PA is reported as "True", explicitly turn prefix caching off.
if [ "$VLLM_CONTIGUOUS_PA" == "True" ]; then # Checks if using contiguous PA
EXTRA_ARGS+=" --no-enable-prefix-caching"
fi

# Any ASYNC_SCHEDULING value greater than zero enables async scheduling.
if [ $ASYNC_SCHEDULING -gt 0 ]; then # Checks if using async scheduling
EXTRA_ARGS+=" --async_scheduling"
fi
Expand Down
4 changes: 2 additions & 2 deletions .cd/tests/test_vllm_autocalc_rules.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,13 +125,13 @@ def test_calc_NUM_DECODE_GRAPHS(cpa):


def test_calc_PROMPT_BS_RAMP_GRAPHS():
    """Check the ramp-graph count for bucket max 16, step 8 and min 2."""
    ctx = {'VLLM_PROMPT_BS_BUCKET_MAX': 16, 'VLLM_PROMPT_BS_BUCKET_STEP': 8, 'VLLM_PROMPT_BS_BUCKET_MIN': 2}
    # Mirrors the rule's formula: 1 + int(log2(min(max, step) / min)).
    expected = 1 + int(math.log(min(16, 8) / 2, 2))
    assert rules.calc_PROMPT_BS_RAMP_GRAPHS(ctx) == expected


def test_calc_PROMPT_BS_STEP_GRAPHS():
    """Check the step-graph count for bucket max 32 and step 8."""
    ctx = {'VLLM_PROMPT_BS_BUCKET_MAX': 32, 'VLLM_PROMPT_BS_BUCKET_STEP': 8}
    # Mirrors the rule's formula: max(0, int(1 + (max - step) / step)).
    expected = max(0, int(1 + (32 - 8) / 8))
    assert rules.calc_PROMPT_BS_STEP_GRAPHS(ctx) == expected

Expand Down