diff --git a/tests/spyre/spyre_util.py b/tests/spyre/spyre_util.py
index c2985453c..e9b96c375 100644
--- a/tests/spyre/spyre_util.py
+++ b/tests/spyre/spyre_util.py
@@ -281,3 +281,40 @@ def compare_embedding_results(model: str, prompts: List[str],
                                    vllm_result["embeddings"])
 
         assert math.isclose(sim, 1.0, rel_tol=0.05)
+
+
+# get model directory path from env, if not set then default to "/models".
+def get_spyre_model_dir_path():
+    model_dir_path = os.environ.get("SPYRE_TEST_MODEL_DIR", "/models")
+    return model_dir_path
+
+
+# get model backend from env, if not set then default to "eager"
+# For multiple values:
+# export SPYRE_TEST_BACKEND_LIST="eager, inductor, sendnn_decoder"
+def get_spyre_backend_list():
+    test_backend_list = []
+    user_backend_list = os.environ.get("SPYRE_TEST_BACKEND_LIST", "eager")
+
+    for backend in user_backend_list.split(","):
+        test_backend_list.append(backend.strip())
+    return test_backend_list
+
+
+# get model names from env, if not set then default to "llama-194m"
+# For multiple values:
+# export SPYRE_TEST_MODEL_LIST="llama-194m,all-roberta-large-v1"
+def get_spyre_model_list(isEmbeddings=False):
+    spyre_model_dir_path = get_spyre_model_dir_path()
+    test_model_list = []
+    user_test_model_list = os.environ.get("SPYRE_TEST_MODEL_LIST",
+                                          "llama-194m")
+
+    # set default to bert if testing embeddings
+    if isEmbeddings:
+        user_test_model_list = os.environ.get("SPYRE_TEST_MODEL_LIST",
+                                              "all-roberta-large-v1")
+
+    for model in user_test_model_list.split(","):
+        test_model_list.append(f"{spyre_model_dir_path}/{model.strip()}")
+    return test_model_list
diff --git a/tests/spyre/test_spyre_basic.py b/tests/spyre/test_spyre_basic.py
index 0cd53a72a..6ab4db02f 100644
--- a/tests/spyre/test_spyre_basic.py
+++ b/tests/spyre/test_spyre_basic.py
@@ -7,22 +7,27 @@
 
 import pytest
 from spyre_util import (compare_results, generate_hf_output,
-                        generate_spyre_vllm_output)
+                        generate_spyre_vllm_output, get_spyre_backend_list,
+                        get_spyre_model_list)
 from vllm import SamplingParams
 
 
-@pytest.mark.parametrize("model", ["/models/llama-194m"])
-@pytest.mark.parametrize("prompts", [[
-    "Provide a list of instructions for preparing"
-    " chicken soup for a family of four.", "Hello",
-    "What is the weather today like?", "Who are you?"
-]])
+@pytest.mark.parametrize("model", get_spyre_model_list())
+@pytest.mark.parametrize(
+    "prompts",
+    [[
+        "Provide a list of instructions for preparing"
+        " chicken soup for a family of four.",
+        "Hello",
+        "What is the weather today like?",
+        "Who are you?",
+    ]],
+)
 @pytest.mark.parametrize("warmup_shape", [(64, 20, 4), (64, 20, 8),
                                           (128, 20, 4), (128, 20, 8)]
                          )  # (prompt_length/new_tokens/batch_size)
-@pytest.mark.parametrize("backend",
-                         ["eager"])  #, "inductor", "sendnn_decoder"])
+@pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_output(
     model: str,
     prompts: List[str],
@@ -48,7 +53,8 @@ def test_output(
         max_tokens=max_new_tokens,
         temperature=0,
         logprobs=0,  # return logprobs of generated tokens only
-        ignore_eos=True)
+        ignore_eos=True,
+    )
 
     vllm_results = generate_spyre_vllm_output(
         model=model,
@@ -58,16 +64,19 @@ def test_output(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend)
+        backend=backend,
+    )
 
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
                                     max_new_tokens=max_new_tokens)
 
-    compare_results(model=model,
-                    prompts=prompts,
-                    warmup_shapes=[warmup_shape],
-                    tensor_parallel_size=1,
-                    backend=backend,
-                    vllm_results=vllm_results,
-                    hf_results=hf_results)
+    compare_results(
+        model=model,
+        prompts=prompts,
+        warmup_shapes=[warmup_shape],
+        tensor_parallel_size=1,
+        backend=backend,
+        vllm_results=vllm_results,
+        hf_results=hf_results,
+    )
diff --git a/tests/spyre/test_spyre_embeddings.py b/tests/spyre/test_spyre_embeddings.py
index 8c8056158..4289ea677 100644
--- a/tests/spyre/test_spyre_embeddings.py
+++ b/tests/spyre/test_spyre_embeddings.py
@@ -7,21 +7,26 @@
 
 import pytest
 from spyre_util import (compare_embedding_results, spyre_vllm_embeddings,
-                        st_embeddings)
-
-
-@pytest.mark.parametrize("model", ["/models/all-roberta-large-v1"])
-@pytest.mark.parametrize("prompts", [[
-    "The capital of France is Paris."
-    "Provide a list of instructions for preparing"
-    " chicken soup for a family of four.", "Hello",
-    "What is the weather today like?", "Who are you?"
-]])
+                        st_embeddings, get_spyre_backend_list,
+                        get_spyre_model_list)
+
+
+@pytest.mark.parametrize("model", get_spyre_model_list(isEmbeddings=True))
+@pytest.mark.parametrize(
+    "prompts",
+    [[
+        "The capital of France is Paris."
+        "Provide a list of instructions for preparing"
+        " chicken soup for a family of four.",
+        "Hello",
+        "What is the weather today like?",
+        "Who are you?",
+    ]],
+)
 @pytest.mark.parametrize("warmup_shape",
                          [(64, 4), (64, 8), (128, 4),
                           (128, 8)])  # (prompt_length/new_tokens/batch_size)
-@pytest.mark.parametrize("backend",
-                         ["eager"])  #, "inductor", "sendnn_decoder"])
+@pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_output(
     model: str,
     prompts: List[str],
@@ -35,20 +40,24 @@ def test_output(
     are verified to be identical for vLLM and SentenceTransformers.
     '''
 
-    vllm_results = spyre_vllm_embeddings(model=model,
-                                         prompts=prompts,
-                                         warmup_shapes=[warmup_shape],
-                                         max_model_len=256,
-                                         block_size=256,
-                                         tensor_parallel_size=1,
-                                         backend=backend)
+    vllm_results = spyre_vllm_embeddings(
+        model=model,
+        prompts=prompts,
+        warmup_shapes=[warmup_shape],
+        max_model_len=256,
+        block_size=256,
+        tensor_parallel_size=1,
+        backend=backend,
+    )
 
     hf_results = st_embeddings(model=model, prompts=prompts)
 
-    compare_embedding_results(model=model,
-                              prompts=prompts,
-                              warmup_shapes=[warmup_shape],
-                              tensor_parallel_size=1,
-                              backend=backend,
-                              vllm_results=vllm_results,
-                              hf_results=hf_results)
+    compare_embedding_results(
+        model=model,
+        prompts=prompts,
+        warmup_shapes=[warmup_shape],
+        tensor_parallel_size=1,
+        backend=backend,
+        vllm_results=vllm_results,
+        hf_results=hf_results,
+    )
diff --git a/tests/spyre/test_spyre_max_new_tokens.py b/tests/spyre/test_spyre_max_new_tokens.py
index fccb67084..9a301fb50 100644
--- a/tests/spyre/test_spyre_max_new_tokens.py
+++ b/tests/spyre/test_spyre_max_new_tokens.py
@@ -6,8 +6,11 @@
 from typing import List, Tuple
 
 import pytest
-from spyre_util import (compare_results, generate_hf_output,
-                        generate_spyre_vllm_output)
+from spyre_util import (
+    compare_results,
+    generate_hf_output,
+    generate_spyre_vllm_output,
+)
 from vllm import SamplingParams
 
 
@@ -22,14 +25,19 @@
 
 @pytest.mark.parametrize("model", ["/models/llama-194m"])
-@pytest.mark.parametrize("prompts", [[prompt1, prompt2, prompt2, prompt2],
-                                     [prompt2, prompt2, prompt2, prompt1],
-                                     [prompt2, prompt2, prompt2, prompt2]])
+@pytest.mark.parametrize(
+    "prompts",
+    [
+        [prompt1, prompt2, prompt2, prompt2],
+        [prompt2, prompt2, prompt2, prompt1],
+        [prompt2, prompt2, prompt2, prompt2],
+    ],
+)
 @pytest.mark.parametrize("stop_last", [True, False])
 @pytest.mark.parametrize("warmup_shape", [(64, 10, 4)]
                          )  # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("backend",
-                         ["eager"])  #, "inductor", "sendnn_decoder"])
+                         ["eager"])  # , "inductor", "sendnn_decoder"])
 def test_output(
     model: str,
     prompts: List[str],
@@ -57,13 +65,15 @@ def test_output(
         max_tokens=max_new_tokens_warmup,
         temperature=0,
         logprobs=0,  # return logprobs of generated tokens only
-        ignore_eos=False)
+        ignore_eos=False,
+    )
 
     vllm_sampling_params_early_stop = SamplingParams(
         max_tokens=max_new_tokens_early_stop,
         temperature=0,
         logprobs=0,  # return logprobs of generated tokens only
-        ignore_eos=False)
+        ignore_eos=False,
+    )
 
     vllm_sampling_params = [vllm_sampling_params_normal] * 3
     hf_max_new_tokens = [max_new_tokens_warmup] * 3
@@ -87,16 +97,19 @@ def test_output(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend)
+        backend=backend,
+    )
 
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
                                     max_new_tokens=hf_max_new_tokens)
 
-    compare_results(model=model,
-                    prompts=prompts,
-                    warmup_shapes=[warmup_shape],
-                    tensor_parallel_size=1,
-                    backend=backend,
-                    vllm_results=vllm_results,
-                    hf_results=hf_results)
+    compare_results(
+        model=model,
+        prompts=prompts,
+        warmup_shapes=[warmup_shape],
+        tensor_parallel_size=1,
+        backend=backend,
+        vllm_results=vllm_results,
+        hf_results=hf_results,
+    )
diff --git a/tests/spyre/test_spyre_max_prompt_length.py b/tests/spyre/test_spyre_max_prompt_length.py
index e2fdd9e18..654da41a2 100644
--- a/tests/spyre/test_spyre_max_prompt_length.py
+++ b/tests/spyre/test_spyre_max_prompt_length.py
@@ -7,13 +7,15 @@
 
 import pytest
 from spyre_util import (compare_results, generate_hf_output,
-                        generate_spyre_vllm_output)
+                        generate_spyre_vllm_output, get_spyre_backend_list,
+                        get_spyre_model_list)
+
 from transformers import AutoTokenizer
 from vllm import SamplingParams
 
 
-@pytest.mark.parametrize("model", ["/models/llama-194m"])
+@pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("prompts", [
     7 * [
         "Hello",
@@ -22,14 +24,14 @@
         " to the user. Provide a list of instructions for preparing chicken "
         "soup for a family of four. Indicate if the weather forecast looks "
         "good for today. Explain in a brief summary comprised of at most 50"
-        " words what you are."
+        " words what you are.",
     ]
-])
+    ],
+)
 @pytest.mark.parametrize("warmup_shapes", [[(64, 20, 4)],
                                            [(64, 20, 4), (128, 20, 4)]]
                          )  # (prompt_length/new_tokens/batch_size)
-@pytest.mark.parametrize("backend",
-                         ["eager"])  #, "inductor", "sendnn_decoder"])
+@pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_output(
     model: str,
     prompts: List[str],
@@ -62,7 +64,8 @@ def test_output(
         max_tokens=max_new_tokens,
         temperature=0,
         logprobs=0,  # return logprobs of generated tokens only
-        ignore_eos=True)
+        ignore_eos=True,
+    )
 
     vllm_results = generate_spyre_vllm_output(
         model=model,
@@ -72,7 +75,8 @@ def test_output(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend)
+        backend=backend,
+    )
 
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
@@ -86,16 +90,18 @@ def test_output(
         hf_input_tokens = hf_tokenizer(prompt, return_tensors="pt").input_ids
         if len(hf_input_tokens[0]) > max_prompt_length:
             hf_results[prompt_index] = {
-                'text': '',
-                'token_ids': (),
-                'tokens': (),
-                'logprobs': ()
+                "text": "",
+                "token_ids": (),
+                "tokens": (),
+                "logprobs": (),
             }
 
-    compare_results(model=model,
-                    prompts=prompts,
-                    warmup_shapes=warmup_shapes,
-                    tensor_parallel_size=1,
-                    backend=backend,
-                    vllm_results=vllm_results,
-                    hf_results=hf_results)
+    compare_results(
+        model=model,
+        prompts=prompts,
+        warmup_shapes=warmup_shapes,
+        tensor_parallel_size=1,
+        backend=backend,
+        vllm_results=vllm_results,
+        hf_results=hf_results,
+    )
diff --git a/tests/spyre/test_spyre_seed.py b/tests/spyre/test_spyre_seed.py
index 01bebdce5..6f6939a0d 100644
--- a/tests/spyre/test_spyre_seed.py
+++ b/tests/spyre/test_spyre_seed.py
@@ -7,23 +7,26 @@
 from typing import Tuple
 
 import pytest
-from spyre_util import generate_spyre_vllm_output
+from spyre_util import (generate_spyre_vllm_output, get_spyre_backend_list,
+                        get_spyre_model_list)
 from vllm import SamplingParams
 
 
-@pytest.mark.parametrize("model", ["/models/llama-194m"])
-@pytest.mark.parametrize("prompt", [
-    "Provide a list of instructions for preparing"
-    " chicken soup for a family of four."
-])
+@pytest.mark.parametrize("model", get_spyre_model_list())
+@pytest.mark.parametrize(
+    "prompt",
+    [
+        "Provide a list of instructions for preparing"
+        " chicken soup for a family of four."
+    ],
+)
 @pytest.mark.parametrize("temperature", [0.1, 1.0])
 @pytest.mark.parametrize("seed", [42])
 @pytest.mark.parametrize("warmup_shape", [(64, 20, 4), (64, 20, 8),
                                           (128, 20, 4), (128, 20, 8)]
                          )  # (prompt_length/new_tokens/batch_size)
-@pytest.mark.parametrize("backend",
-                         ["eager"])  #, "inductor", "sendnn_decoder"])
+@pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_seed(
     model: str,
     prompt: str,
@@ -49,7 +52,8 @@ def test_seed(
         temperature=temperature,
         logprobs=0,  # return logprobs of generated tokens only
         ignore_eos=True,
-        seed=seed)
+        seed=seed,
+    )
 
     vllm_results = generate_spyre_vllm_output(
         model=model,
@@ -59,17 +63,21 @@ def test_seed(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend)
+        backend=backend,
+    )
 
     # compare all generated outputs against the first generated output
     for vllm_result in vllm_results:
-        assert vllm_result['text'] == vllm_results[0]['text']
+        assert vllm_result["text"] == vllm_results[0]["text"]
 
         # compare logprobs for all tokens between
         # the current and the first sequence
-        assert len(vllm_result['logprobs']) == len(vllm_results[0]['logprobs'])
+        assert len(vllm_result["logprobs"]) == len(vllm_results[0]["logprobs"])
         for token_id, logprob, token_id_0, logprob_0 in zip(
-                vllm_result['token_ids'], vllm_result['logprobs'],
-                vllm_results[0]['token_ids'], vllm_results[0]['logprobs']):
+            vllm_result["token_ids"],
+            vllm_result["logprobs"],
+            vllm_results[0]["token_ids"],
+            vllm_results[0]["logprobs"],
+        ):
             assert token_id == token_id_0
             assert math.isclose(logprob, logprob_0, rel_tol=0.1)
diff --git a/tests/spyre/test_spyre_tensor_parallel.py b/tests/spyre/test_spyre_tensor_parallel.py
index f6d2626fc..a53721039 100644
--- a/tests/spyre/test_spyre_tensor_parallel.py
+++ b/tests/spyre/test_spyre_tensor_parallel.py
@@ -7,23 +7,28 @@
 
 import pytest
 from spyre_util import (compare_results, generate_hf_output,
-                        generate_spyre_vllm_output)
+                        generate_spyre_vllm_output, get_spyre_backend_list,
+                        get_spyre_model_list)
 from vllm import SamplingParams
 
 
-@pytest.mark.parametrize("model", ["/models/llama-194m"])
-@pytest.mark.parametrize("prompts", [[
-    "Provide a list of instructions for preparing"
-    " chicken soup for a family of four.", "Hello",
-    "What is the weather today like?", "Who are you?"
-]])
-@pytest.mark.parametrize("warmup_shapes", [[(64, 20, 4)]]
-                         )  #,[(64,20,8)],[(128,20,4)],[(128,20,8)]])
+@pytest.mark.parametrize("model", get_spyre_model_list())
+@pytest.mark.parametrize(
+    "prompts",
+    [[
+        "Provide a list of instructions for preparing"
+        " chicken soup for a family of four.",
+        "Hello",
+        "What is the weather today like?",
+        "Who are you?",
+    ]],
+)
+@pytest.mark.parametrize("warmup_shapes", [[(64, 20, 1)]]
+                         )  # ,[(64,20,8)],[(128,20,4)],[(128,20,8)]])
 # (prompt_length/new_tokens/batch_size)
 @pytest.mark.parametrize("tp_size", [2])
-@pytest.mark.parametrize("backend",
-                         ["eager"])  #, "inductor", "sendnn_decoder"])
+@pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_output(
     model: str,
     prompts: List[str],
@@ -51,7 +56,8 @@ def test_output(
         max_tokens=max_new_tokens,
         temperature=0,
         logprobs=0,  # return logprobs of generated tokens only
-        ignore_eos=True)
+        ignore_eos=True,
+    )
 
     vllm_results = generate_spyre_vllm_output(
         model=model,
@@ -61,16 +67,19 @@ def test_output(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=tp_size,
-        backend=backend)
+        backend=backend,
+    )
 
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
                                     max_new_tokens=max_new_tokens)
 
-    compare_results(model=model,
-                    prompts=prompts,
-                    warmup_shapes=warmup_shapes,
-                    tensor_parallel_size=tp_size,
-                    backend=backend,
-                    vllm_results=vllm_results,
-                    hf_results=hf_results)
+    compare_results(
+        model=model,
+        prompts=prompts,
+        warmup_shapes=warmup_shapes,
+        tensor_parallel_size=tp_size,
+        backend=backend,
+        vllm_results=vllm_results,
+        hf_results=hf_results,
+    )
diff --git a/tests/spyre/test_spyre_warmup_shapes.py b/tests/spyre/test_spyre_warmup_shapes.py
index 675b9188f..2aaa8d507 100644
--- a/tests/spyre/test_spyre_warmup_shapes.py
+++ b/tests/spyre/test_spyre_warmup_shapes.py
@@ -7,12 +7,13 @@
 
 import pytest
 from spyre_util import (compare_results, generate_hf_output,
-                        generate_spyre_vllm_output)
+                        generate_spyre_vllm_output, get_spyre_backend_list,
+                        get_spyre_model_list)
 from vllm import SamplingParams
 
 
-@pytest.mark.parametrize("model", ["/models/llama-194m"])
+@pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("prompts", [
     7 * [
         "Hello",
@@ -21,13 +22,13 @@
         "the user. Provide a list of instructions for preparing chicken soup"
         " for a family of four. Indicate if the weather forecast looks good "
         "for today. Explain in a brief summary comprised of at most 50 words"
-        " what you are."
+        " what you are.",
     ]
-])
+    ],
+)
 @pytest.mark.parametrize("warmup_shapes", [[(64, 20, 8), (128, 20, 4)]]
                          )  # (prompt_length/new_tokens/batch_size)
-@pytest.mark.parametrize("backend",
-                         ["eager"])  #, "inductor", "sendnn_decoder"])
+@pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_output(
     model: str,
     prompts: List[str],
@@ -60,7 +61,8 @@ def test_output(
         max_tokens=max_new_tokens,
         temperature=0,
         logprobs=0,  # return logprobs of generated tokens only
-        ignore_eos=True)
+        ignore_eos=True,
+    )
 
     vllm_results = generate_spyre_vllm_output(
         model=model,
@@ -70,16 +72,19 @@ def test_output(
         block_size=2048,
         sampling_params=vllm_sampling_params,
         tensor_parallel_size=1,
-        backend=backend)
+        backend=backend,
+    )
 
     hf_results = generate_hf_output(model=model,
                                     prompts=prompts,
                                     max_new_tokens=max_new_tokens)
 
-    compare_results(model=model,
-                    prompts=prompts,
-                    warmup_shapes=warmup_shapes,
-                    tensor_parallel_size=1,
-                    backend=backend,
-                    vllm_results=vllm_results,
-                    hf_results=hf_results)
+    compare_results(
+        model=model,
+        prompts=prompts,
+        warmup_shapes=warmup_shapes,
+        tensor_parallel_size=1,
+        backend=backend,
+        vllm_results=vllm_results,
+        hf_results=hf_results,
+    )
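Usage note (not part of the patch): a minimal sketch of how the environment-variable helpers added to tests/spyre/spyre_util.py are intended to be driven. It assumes spyre_util is importable the same way the test modules import it (for example, when run from tests/spyre); the values shown are just the defaults defined in the patch.

# Illustrative only: exercise the helpers added in spyre_util.py.
import os

os.environ.pop("SPYRE_TEST_MODEL_DIR", None)   # unset -> defaults to "/models"
os.environ.pop("SPYRE_TEST_MODEL_LIST", None)  # unset -> per-test-type default model
os.environ["SPYRE_TEST_BACKEND_LIST"] = "eager, inductor"  # comma-separated, spaces tolerated

from spyre_util import get_spyre_backend_list, get_spyre_model_list

print(get_spyre_backend_list())                 # ['eager', 'inductor']
print(get_spyre_model_list())                   # ['/models/llama-194m']
print(get_spyre_model_list(isEmbeddings=True))  # ['/models/all-roberta-large-v1']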