4 files changed: +31 -2 lines changed
Original file line number Diff line number Diff line change
44 pull_request :
55 branches :
66 - main
7+ push :
8+ branches :
9+ - main
710
811concurrency :
912 group : ${{ github.workflow }}-${{ github.ref }}
Original file line number Diff line number Diff line change
44 pull_request :
55 branches :
66 - main
7+ push :
8+ branches :
9+ - main
710
811concurrency :
912 group : ${{ github.workflow }}-${{ github.ref }}
Original file line number Diff line number Diff line change
@@ -40,7 +40,15 @@ def main():
4040 "modifications" : [
4141 (
4242 "torch.cuda.empty_cache()" ,
43- "if use_cuda: torch.cuda.empty_cache()" ,
43+ "if use_cuda: torch.cuda.empty_cache()"
44+ ),
45+ (
46+ "init_process_group(backend='nccl')" ,
47+ "init_process_group(backend='gloo')"
48+ ),
49+ (
50+ "torch.cuda.set_device(int(os.environ['LOCAL_RANK']))" ,
51+ "# torch.cuda.set_device(int(os.environ['LOCAL_RANK']))"
4452 )
4553 ],
4654 }
Original file line number Diff line number Diff line change
@@ -4,7 +4,22 @@ current_directory=$(pwd)
44llmc=$( echo " $current_directory " | sed ' s/\/ci_check$//' )
55export PYTHONPATH=$llmc :$PYTHONPATH
66
7+ config=${llmc} /ci_check/awq_w4a16_fakequant_eval.yml
8+
9+ nnodes=1
10+ nproc_per_node=1
11+ MASTER_ADDR=127.0.0.1
12+ MASTER_PORT=$(( 10000 + RANDOM % 20000 ))
13+
14+ RANDOM=$( python -c ' import uuid; print(uuid.uuid4())' )
15+ task_id=$RANDOM
716
817cd ../scripts
918
10- python -m llmc --config ../ci_check/awq_w4a16_fakequant_eval.yml
19+ torchrun \
20+ --nnodes $nnodes \
21+ --nproc_per_node $nproc_per_node \
22+ --rdzv_id $task_id \
23+ --rdzv_backend c10d \
24+ --rdzv_endpoint $MASTER_ADDR :$MASTER_PORT \
25+ ${llmc} /llmc/__main__.py --config $config --task_id $task_id \
You can’t perform that action at this time.
0 commit comments