[BugFix] fix mm revert bug (PaddlePaddle#6061)

kevincheng2 · web-flow · commit 0e0eaa1c575d · 2026-01-16T08:13:34.000-08:00
* fix mm revert bug (revert_chunked_mm_input when matched_token_num equals a multimodal position's offset)

* update code
diff --git a/fastdeploy/engine/sched/resource_manager_v1.py b/fastdeploy/engine/sched/resource_manager_v1.py
@@ -402,10 +402,15 @@ def revert_chunked_mm_input(self, mm_inputs, matched_token_num):
                     position.offset // self.config.cache_config.block_size
                 ) * self.config.cache_config.block_size
                 position_idx -= 1
-            elif matched_token_num < position.offset:
+            elif matched_token_num <= position.offset:
                 position_idx -= 1
             elif matched_token_num >= position.offset + position.length:
                 break
+            else:
+                llm_logger.error(
+                    f"revert_chunked_mm_input error, matched_token_num:{matched_token_num} position:{position}, {mm_inputs['mm_positions']}"
+                )
+                break
         return matched_token_num
 
     def _get_num_new_tokens(self, request, token_budget):
@@ -454,6 +459,18 @@ def _compute_audio_prefix_count(end_idx, end_patch_idx):
                 start_patch_idx = inputs["patch_idx"][-1]
             else:
                 start_patch_idx = inputs["patch_idx"][pre_end_idx]
+                if (
+                    pre_end_idx > 0
+                    and request.prompt_token_ids[pre_end_idx]
+                    in [
+                        inputs["image_patch_id"],
+                        inputs["video_patch_id"],
+                        inputs["audio_patch_id"],
+                    ]
+                    and request.prompt_token_ids[pre_end_idx] != request.prompt_token_ids[pre_end_idx - 1]
+                ):
+                    # It just hit the starting position of the image / video / audio
+                    start_patch_idx -= 1
             start_patch_map = inputs["patch_map"][start_patch_idx]
             request.image_start = start_patch_map["image_num"]
             request.video_start = start_patch_map["video_num"]
diff --git a/tests/v1/test_resource_manager_v1.py b/tests/v1/test_resource_manager_v1.py
@@ -284,6 +284,15 @@ def test_revert_chunked_mm_input_after_last_chunk(self):
         result = self.manager.revert_chunked_mm_input(mm_inputs, 256)
         self.assertEqual(result, 256)
 
+    def test_revert_chunked_mm_input_match_image_offset(self):
+        mm_inputs = {
+            "mm_positions": [
+                ImagePosition(offset=64, length=21),
+            ]
+        }
+        result = self.manager.revert_chunked_mm_input(mm_inputs, 64)
+        self.assertEqual(result, 64)
+
 
 if __name__ == "__main__":
     unittest.main()