Skip to content

Commit 0c01ccc

Browse files
[BugFix] Fix double shutdown of the communication group when rank 0 clears weights more slowly than the other ranks (PaddlePaddle#5715)
1 parent 5538dda commit 0c01ccc

1 file changed

Lines changed: 4 additions & 0 deletions

File tree

fastdeploy/rl/dynamic_weight_manager.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -272,10 +272,14 @@ def check_model_weights_status(model_weights_status, model_runner, pid, block):
272272
logger.info("infer engine stopped! start to load new checkpoint...")
273273
model_runner.clear_requests()
274274
model_runner.update_parameters(pid)
275+
while model_weights_status.value[0] != ModelWeightsStatus.NORMAL:
276+
time.sleep(0.01)
275277
logger.info("finished loading new checkpoint")
276278
elif model_weights_status.value[0] == ModelWeightsStatus.CLEARING:
277279
logger.info("infer engine stopped! start to clear checkpoint...")
278280
model_runner.clear_requests()
279281
model_runner.clear_parameters(pid)
282+
while model_weights_status.value[0] != ModelWeightsStatus.CLEARED:
283+
time.sleep(0.01)
280284
logger.info("finished clearing checkpoint")
281285
time.sleep(0.01)

0 commit comments

Comments
 (0)