Skip to content

Commit 5489fde

Browse files
committed
Load forked adapter weights on first training call
After _experimental_fork_checkpoint, store the checkpoint path on the service. On the first _train_dedicated/_train_shared call, load the adapter weights via load_lora_adapter before training begins. This is needed because create_unsloth_train_context may initialize the LoRA architecture from adapter_config.json without loading the actual trained weights from adapter_model.safetensors, especially when the checkpoint was trained at a different precision than the current load config.
1 parent aa96333 commit 5489fde

2 files changed

Lines changed: 21 additions & 5 deletions

File tree

src/art/local/backend.py

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1434,18 +1434,24 @@ async def _experimental_fork_checkpoint(
14341434

14351435
shutil.copytree(source_checkpoint_dir, dest_checkpoint_dir)
14361436

1437-
# Invalidate the UnslothService _state cache so the trainer
1438-
# re-initializes with the forked checkpoint instead of the base model.
1439-
# _state is a cached_property that reads get_last_checkpoint_dir() on
1440-
# first access; if it was accessed before the fork, it cached the base
1441-
# model and will never pick up the forked weights.
1437+
# Ensure the trainer picks up the forked LoRA weights.
1438+
#
1439+
# 1. Invalidate the _state cache so create_unsloth_train_context
1440+
# re-initializes with the forked checkpoint path.
1441+
#
1442+
# 2. Store the forked checkpoint path so the first training call can
1443+
# explicitly load the adapter weights via load_lora_adapter. This
1444+
# is necessary because from_pretrained may set up the LoRA
1445+
# architecture without loading the actual trained weights
1446+
# (especially across precision mismatches).
14421447
service = await self._get_service(cast(TrainableModel, model))
14431448
if hasattr(service, "_state") and "_state" in service.__dict__:
14441449
del service.__dict__["_state"]
14451450
if verbose:
14461451
print(
14471452
"Invalidated UnslothService _state cache to pick up forked checkpoint"
14481453
)
1454+
service._forked_checkpoint_dir = dest_checkpoint_dir
14491455

14501456
if verbose:
14511457
print(

src/art/unsloth/service.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -588,6 +588,11 @@ async def _train_dedicated(
588588
verbose: bool = False,
589589
) -> AsyncIterator[dict[str, float]]:
590590
"""Train in dedicated mode — no sleep/wake, vLLM keeps running on separate GPU."""
591+
# Load forked adapter weights on first training call if needed.
592+
forked_dir = getattr(self, "_forked_checkpoint_dir", None)
593+
if forked_dir is not None:
594+
del self._forked_checkpoint_dir
595+
await self._state.load_lora_adapter(forked_dir)
591596
async for result in run_unsloth_rl_training(
592597
self._state,
593598
disk_packed_tensors=disk_packed_tensors,
@@ -629,6 +634,11 @@ async def _train_shared(
629634
verbose: bool = False,
630635
) -> AsyncIterator[dict[str, float]]:
631636
"""Train in shared mode — sleep/wake cycle with in-process vLLM."""
637+
# Load forked adapter weights on first training call if needed.
638+
forked_dir = getattr(self, "_forked_checkpoint_dir", None)
639+
if forked_dir is not None:
640+
del self._forked_checkpoint_dir
641+
await self._state.load_lora_adapter(forked_dir)
632642
llm = await self.llm
633643

634644
# Pause generation to prevent new requests during training

0 commit comments

Comments (0)