diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index e1a5d4ee28ea1..f407e879bffc9 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -232,14 +232,17 @@ def run_hf(
     use_beam_search: bool,
     max_batch_size: int,
     trust_remote_code: bool,
+    device: str,
 ) -> float:
     assert not use_beam_search
+    is_cuda = device == "cuda"
     llm = AutoModelForCausalLM.from_pretrained(
-        model, torch_dtype=torch.float16, trust_remote_code=trust_remote_code)
+        model, torch_dtype=torch.float16 if is_cuda else torch.float32, trust_remote_code=trust_remote_code)
     if llm.config.model_type == "llama":
         # To enable padding in the HF backend.
         tokenizer.pad_token = tokenizer.eos_token
-    llm = llm.cuda()
+    if is_cuda:
+        llm = llm.cuda()

     pbar = tqdm(total=len(requests))
     start = time.perf_counter()
@@ -264,7 +267,7 @@ def run_hf(
         input_ids = tokenizer(batch, return_tensors="pt",
                               padding=True).input_ids
         llm_outputs = llm.generate(
-            input_ids=input_ids.cuda(),
+            input_ids=input_ids.cuda() if is_cuda else input_ids,
             do_sample=not use_beam_search,
             num_return_sequences=n,
             temperature=1.0,
@@ -341,7 +344,7 @@ def main(args: argparse.Namespace):
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
                               args.use_beam_search, args.hf_max_batch_size,
-                              args.trust_remote_code)
+                              args.trust_remote_code, args.device)
     elif args.backend == "mii":
         elapsed_time = run_mii(requests, args.model, args.tensor_parallel_size,
                                args.output_len)
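
For reference, the device-conditional model setup this patch gives the HF backend can be reproduced standalone with the sketch below (args.device is presumably wired to a --device argument added elsewhere in the patch). The load_hf_model helper and the model id in the usage comment are illustrative only, not part of the benchmark script:

    import torch
    from transformers import AutoModelForCausalLM

    def load_hf_model(model_name: str, device: str = "cpu"):
        # Mirrors the patched run_hf() setup: fp16 weights plus a .cuda() move
        # only when benchmarking on CUDA, fp32 weights when staying on the CPU.
        is_cuda = device == "cuda"
        llm = AutoModelForCausalLM.from_pretrained(
            model_name, torch_dtype=torch.float16 if is_cuda else torch.float32)
        if is_cuda:
            llm = llm.cuda()
        return llm

    # Example invocation (hypothetical model id):
    # llm = load_hf_model("facebook/opt-125m", device="cpu")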