@@ -291,6 +291,16 @@ jobs:
          bash .ci/scripts/validate.sh "./checkpoints/${REPO_NAME}/model.pth" "cuda" "aoti-bfloat16"
          echo "::endgroup::"

+         echo "::group::Run inference with quantize file"
+         for DEVICE in cpu; do # cuda
+           # cuda - fails because `AttributeError: 'Linear' object has no attribute '_linear_extra_repr'`
+           # follow up with torchao as a separate PR
+           echo "saving snapshot for device ${DEVICE} and dtype bfloat16, and reloading as snapshot"
+           python3 torchchat.py export --device ${DEVICE} --output-snap model.tc --dtype bfloat16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+           python3 torchchat.py generate --device ${DEVICE} --snap model.tc --dtype bfloat16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+         done
+         echo "::endgroup::"
+
  test-gpu-aoti-float32:
    permissions:
      id-token: write
@@ -335,6 +345,11 @@ jobs:
          fi
          echo "::endgroup::"

+         # echo "::group::Run inference with quantize file"
+         # python3 torchchat.py export --output-snap model.tc --dtype float32 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+         # python3 torchchat.py generate --snap model.tc --dtype float32 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+         # echo "::endgroup::"
+
  test-gpu-aoti-float16:
    permissions:
      id-token: write
@@ -376,10 +391,15 @@ jobs:
          echo "::group::Run inference with quantize file"
          if [ $(uname -s) == Darwin ]; then
            python3 torchchat.py export --output-aoti-package-path /tmp/model.pt2 --quantize torchchat/quant_config/cuda.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
-           python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"~
+           python3 torchchat.py generate --aoti-package-path /tmp/model.pt2 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
          fi
          echo "::endgroup::"

+         # echo "::group::Run inference with quantize file"
+         # python3 torchchat.py export --output-snap model.tc --dtype float16 --quantize torchchat/quant_config/cuda-32.json --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+         # python3 torchchat.py generate --snap model.tc --dtype float16 --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
+         # echo "::endgroup::"
+
  test-gpu-eval-sanity-check:
    permissions:
      id-token: write
@@ -495,10 +515,11 @@ jobs:
          python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte

          echo "******************************************"
-         echo "*** --quantize torchchat/quant_config/mobile.json ***"
+         echo "*** can't test --quantize torchchat/quant_config/mobile.json ***"
+         echo "*** testing --quantize torchchat/quant_config/mobile-32.json ***"
          echo "******************************************"
-         # python torchchat.py export --quantize torchchat/quant_config/mobile.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
-         # python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+         python torchchat.py export --quantize torchchat/quant_config/mobile-32.json --checkpoint-path ${MODEL_PATH} --output-pte-path ${MODEL_DIR}/${MODEL_NAME}.pte
+         python3 torchchat.py generate --checkpoint-path ${MODEL_PATH} --temperature 0 --pte-path ${MODEL_DIR}/${MODEL_NAME}.pte


          echo "******************************************"
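For reference, the snapshot round-trip exercised in the first hunk can be reproduced outside CI. A minimal sketch, assuming a torchchat checkout with dependencies installed and a checkpoint already downloaded; the REPO_NAME value is a hypothetical example, every flag is taken from the workflow above:

# Sketch: reproduce the CPU snapshot export/reload step from the workflow locally.
# REPO_NAME is a placeholder; substitute the model actually downloaded.
REPO_NAME="meta-llama/Llama-3.2-1B"
# Export a quantized snapshot on cpu (cuda currently fails with the
# `_linear_extra_repr` AttributeError noted in the diff).
python3 torchchat.py export --device cpu --output-snap model.tc --dtype bfloat16 \
  --quantize torchchat/quant_config/cuda-32.json \
  --checkpoint "./checkpoints/${REPO_NAME}/model.pth"
# Reload the snapshot and run generation from it.
python3 torchchat.py generate --device cpu --snap model.tc --dtype bfloat16 \
  --checkpoint "./checkpoints/${REPO_NAME}/model.pth"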