Fix LocalBackend fork to load forked LoRA on both vLLM and trainer

arcticfly · arcticfly · commit 0d53531b1b9d · 2026-04-14T13:44:55.000-07:00
Two fixes applied after _experimental_fork_checkpoint copies the checkpoint: 1. Overwrite checkpoints/0000 with the forked weights so that vLLM, which loads @0 by default, starts up with the correct adapter. 2. Invalidate the UnslothService _state cache so that the trainer re-initializes from the forked checkpoint path instead of from the base model.
diff --git a/src/art/local/backend.py b/src/art/local/backend.py
@@ -1434,15 +1434,32 @@ async def _experimental_fork_checkpoint(
 
         shutil.copytree(source_checkpoint_dir, dest_checkpoint_dir)
 
-        # Also overwrite the initial empty checkpoint at step 0 so vLLM
-        # loads the forked weights on startup (it uses @0 by default)
+        # Overwrite the initial empty checkpoint at step 0 so both vLLM
+        # (which loads @0) and the Unsloth trainer (which may have already
+        # cached _state from the empty checkpoint) pick up the forked weights.
         step0_dir = get_step_checkpoint_dir(dest_model_dir, 0)
         if os.path.exists(step0_dir) and step0_dir != dest_checkpoint_dir:
             if verbose:
-                print(f"Overwriting initial checkpoint at {step0_dir} with forked weights")
+                print(
+                    f"Overwriting initial checkpoint at {step0_dir} with forked weights"
+                )
             shutil.rmtree(step0_dir)
             shutil.copytree(dest_checkpoint_dir, step0_dir)
 
+        # Invalidate the UnslothService _state cache so the trainer
+        # re-initializes with the forked checkpoint instead of the base model.
+        try:
+            service = await self._get_service(cast(TrainableModel, model))
+            if "_state" in service.__dict__:
+                del service.__dict__["_state"]
+                if verbose:
+                    print(
+                        "Invalidated UnslothService _state cache "
+                        "to pick up forked checkpoint"
+                    )
+        except Exception:
+            pass
+
         if verbose:
             print(
                 f"Successfully forked checkpoint from {from_model} (step {selected_step}) to {model.name}"